summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c7
-rw-r--r--mm/bootmem.c2
-rw-r--r--mm/bounce.c11
-rw-r--r--mm/dmapool.c3
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/filemap_xip.c2
-rw-r--r--mm/fremap.c1
-rw-r--r--mm/highmem.c2
-rw-r--r--mm/huge_memory.c37
-rw-r--r--mm/internal.h46
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/maccess.c2
-rw-r--r--mm/memcontrol.c1007
-rw-r--r--mm/memory-failure.c1
-rw-r--r--mm/memory.c4
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mm_init.c2
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/mmu_context.c2
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/mmzone.c1
-rw-r--r--mm/nobootmem.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page-writeback.c708
-rw-r--r--mm/page_cgroup.c9
-rw-r--r--mm/quicklist.c1
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slob.c2
-rw-r--r--mm/sparse-vmemmap.c1
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c85
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c1
-rw-r--r--mm/truncate.c2
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmscan.c7
43 files changed, 1240 insertions, 743 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7520ef0bfd47..a0860640378d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -97,6 +97,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %10lu kB\n"
"DirtyThresh: %10lu kB\n"
"BackgroundThresh: %10lu kB\n"
+ "BdiDirtied: %10lu kB\n"
"BdiWritten: %10lu kB\n"
"BdiWriteBandwidth: %10lu kBps\n"
"b_dirty: %10lu\n"
@@ -109,6 +110,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
K(bdi_thresh),
K(dirty_thresh),
K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
(unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
(unsigned long) K(bdi->write_bandwidth),
nr_dirty,
@@ -473,7 +475,8 @@ static int bdi_forker_thread(void *ptr)
* the bdi from the thread. Hopefully 1024 is
* large enough for efficient IO.
*/
- writeback_inodes_wb(&bdi->wb, 1024);
+ writeback_inodes_wb(&bdi->wb, 1024,
+ WB_REASON_FORKER_THREAD);
} else {
/*
* The spinlock makes sure we do not lose
@@ -683,6 +686,8 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->bw_time_stamp = jiffies;
bdi->written_stamp = 0;
+ bdi->balanced_dirty_ratelimit = INIT_BW;
+ bdi->dirty_ratelimit = INIT_BW;
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0c..1a77012ecdb3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,7 +12,7 @@
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
#include <linux/memblock.h>
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de68184b..4e9ae722af83 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -4,7 +4,7 @@
*/
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/bio.h>
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/bootmem.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool;
#ifdef CONFIG_HIGHMEM
static __init int init_emergency_pool(void)
{
- struct sysinfo i;
- si_meminfo(&i);
- si_swapinfo(&i);
-
- if (!i.totalhigh)
+#ifndef CONFIG_MEMORY_HOTPLUG
+ if (max_pfn <= max_low_pfn)
return 0;
+#endif
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fbb58e346888..c5ab33bca0a8 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -27,11 +27,12 @@
#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/stat.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/types.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index 5cf820a7c8ec..c0018f2d50e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@
* most "normal" filesystems (but you don't /have/ to use this:
* the NFS filesystem used to do this differently, for example)
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd12828..f91b2f687343 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,7 +10,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/uio.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
diff --git a/mm/fremap.c b/mm/fremap.c
index b8e0e2d468af..9ed4fd432467 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -13,7 +13,6 @@
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
-#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
diff --git a/mm/highmem.c b/mm/highmem.c
index e159a7b1cc22..57d82c6250c3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@
*/
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 860ec211ddd6..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON(!PageCompound(page));
if (flags & FOLL_GET)
- get_page(page);
+ get_page_foll(page);
out:
return page;
@@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
unsigned long head_index = page->index;
struct zone *zone = page_zone(page);
int zonestat;
+ int tail_count = 0;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
for (i = 1; i < HPAGE_PMD_NR; i++) {
struct page *page_tail = page + i;
- /* tail_page->_count cannot change */
- atomic_sub(atomic_read(&page_tail->_count), &page->_count);
- BUG_ON(page_count(page) <= 0);
- atomic_add(page_mapcount(page) + 1, &page_tail->_count);
- BUG_ON(atomic_read(&page_tail->_count) <= 0);
+ /* tail_page->_mapcount cannot change */
+ BUG_ON(page_mapcount(page_tail) < 0);
+ tail_count += page_mapcount(page_tail);
+ /* check for overflow */
+ BUG_ON(tail_count < 0);
+ BUG_ON(atomic_read(&page_tail->_count) != 0);
+ /*
+ * tail_page->_count is zero and not changing from
+ * under us. But get_page_unless_zero() may be running
+ * from under us on the tail_page. If we used
+ * atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is
+ * implemented in C not using locked ops. spin_unlock
+ * on x86 sometime uses locked ops because of PPro
+ * errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and
+ * not only on x86), it's safer to use atomic_add().
+ */
+ atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+ &page_tail->_count);
/* after clearing PageTail the gup refcount can be released */
smp_mb();
@@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
(1L << PG_uptodate)));
page_tail->flags |= (1L << PG_dirty);
- /*
- * 1) clear PageTail before overwriting first_page
- * 2) clear PageTail before clearing PageHead for VM_BUG_ON
- */
+ /* clear PageTail before overwriting first_page */
smp_wmb();
/*
@@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
* status is achieved setting a reserved bit in the
* pmd, not by clearing the present bit.
*/
- BUG_ON(page_mapcount(page_tail));
page_tail->_mapcount = page->_mapcount;
BUG_ON(page_tail->mapping);
@@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
lru_add_page_tail(zone, page, page_tail);
}
+ atomic_sub(tail_count, &page->_count);
+ BUG_ON(atomic_read(&page->_count) <= 0);
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb49..2189af491783 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
+static inline void __get_page_tail_foll(struct page *page,
+ bool get_page_head)
+{
+ /*
+ * If we're getting a tail page, the elevated page->_count is
+ * required only in the head page and we will elevate the head
+ * page->_count and tail page->_mapcount.
+ *
+ * We elevate page_tail->_mapcount for tail pages to force
+ * page_tail->_count to be zero at all times to avoid getting
+ * false positives from get_page_unless_zero() with
+ * speculative page access (like in
+ * page_cache_get_speculative()) on tail pages.
+ */
+ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ if (get_page_head)
+ atomic_inc(&page->first_page->_count);
+ atomic_inc(&page->_mapcount);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ /*
+ * This is safe only because
+ * __split_huge_page_refcount() can't run under
+ * get_page_foll() because we hold the proper PT lock.
+ */
+ __get_page_tail_foll(page, true);
+ else {
+ /*
+ * Getting a normal page or the head of a compound page
+ * requires to already have an elevated page->_count.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_inc(&page->_count);
+ }
+}
+
extern unsigned long highest_memmap_pfn;
/*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d6880f542f95..f3b2a00fe9c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -69,7 +69,7 @@
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kthread.h>
#include <linux/prio_tree.h>
#include <linux/fs.h>
diff --git a/mm/maccess.c b/mm/maccess.c
index 4cee182ab5f3..d53adf9ba84b 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,7 +1,7 @@
/*
* Access kernel memory without faulting.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d5755544afe..6aff93c98aca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,6 +33,7 @@
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
+#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
@@ -201,8 +202,8 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
-static void mem_cgroup_threshold(struct mem_cgroup *mem);
-static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
+static void mem_cgroup_threshold(struct mem_cgroup *memcg);
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/*
* The memory controller data structure. The memory controller controls both
@@ -362,29 +363,29 @@ enum charge_type {
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
-static void mem_cgroup_get(struct mem_cgroup *mem);
-static void mem_cgroup_put(struct mem_cgroup *mem);
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(struct mem_cgroup *mem);
+static void mem_cgroup_get(struct mem_cgroup *memcg);
+static void mem_cgroup_put(struct mem_cgroup *memcg);
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
- return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+ return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
}
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
- return &mem->css;
+ return &memcg->css;
}
static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
+page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
int nid = page_to_nid(page);
int zid = page_zonenum(page);
- return mem_cgroup_zoneinfo(mem, nid, zid);
+ return mem_cgroup_zoneinfo(memcg, nid, zid);
}
static struct mem_cgroup_tree_per_zone *
@@ -403,7 +404,7 @@ soft_limit_tree_from_page(struct page *page)
}
static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz,
unsigned long long new_usage_in_excess)
@@ -437,7 +438,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
}
static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
@@ -448,17 +449,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
}
static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
spin_lock(&mctz->lock);
- __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
spin_unlock(&mctz->lock);
}
-static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
unsigned long long excess;
struct mem_cgroup_per_zone *mz;
@@ -471,9 +472,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
*/
- for (; mem; mem = parent_mem_cgroup(mem)) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- excess = res_counter_soft_limit_excess(&mem->res);
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ excess = res_counter_soft_limit_excess(&memcg->res);
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
@@ -482,18 +483,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
- __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
- __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
spin_unlock(&mctz->lock);
}
}
}
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
int node, zone;
struct mem_cgroup_per_zone *mz;
@@ -501,9 +502,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
for_each_node_state(node, N_POSSIBLE) {
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = mem_cgroup_zoneinfo(mem, node, zone);
+ mz = mem_cgroup_zoneinfo(memcg, node, zone);
mctz = soft_limit_tree_node_zone(node, zone);
- mem_cgroup_remove_exceeded(mem, mz, mctz);
+ mem_cgroup_remove_exceeded(memcg, mz, mctz);
}
}
}
@@ -564,7 +565,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
* common workload, threashold and synchonization as vmstat[] should be
* implemented.
*/
-static long mem_cgroup_read_stat(struct mem_cgroup *mem,
+static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
long val = 0;
@@ -572,81 +573,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
get_online_cpus();
for_each_online_cpu(cpu)
- val += per_cpu(mem->stat->count[idx], cpu);
+ val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&mem->pcp_counter_lock);
- val += mem->nocpu_base.count[idx];
- spin_unlock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
+ val += memcg->nocpu_base.count[idx];
+ spin_unlock(&memcg->pcp_counter_lock);
#endif
put_online_cpus();
return val;
}
-static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
{
int val = (charge) ? 1 : -1;
- this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
+ this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}
-void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
+void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
{
- this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
+ this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
}
-void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
+void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
{
- this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
+ this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
}
-static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx)
{
unsigned long val = 0;
int cpu;
for_each_online_cpu(cpu)
- val += per_cpu(mem->stat->events[idx], cpu);
+ val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&mem->pcp_counter_lock);
- val += mem->nocpu_base.events[idx];
- spin_unlock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
+ val += memcg->nocpu_base.events[idx];
+ spin_unlock(&memcg->pcp_counter_lock);
#endif
return val;
}
-static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
bool file, int nr_pages)
{
preempt_disable();
if (file)
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+ nr_pages);
else
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+ nr_pages);
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+ __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
else {
- __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+ __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
+ __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
preempt_enable();
}
unsigned long
-mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
enum lru_list l;
unsigned long ret = 0;
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
for_each_lru(l) {
if (BIT(l) & lru_mask)
@@ -656,44 +659,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
}
static unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
u64 total = 0;
int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++)
- total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
+ total += mem_cgroup_zone_nr_lru_pages(memcg,
+ nid, zid, lru_mask);
return total;
}
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)
{
int nid;
u64 total = 0;
for_each_node_state(nid, N_HIGH_MEMORY)
- total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
+ total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
return total;
}
-static bool __memcg_event_check(struct mem_cgroup *mem, int target)
+static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
{
unsigned long val, next;
- val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
- next = this_cpu_read(mem->stat->targets[target]);
+ val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+ next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
return ((long)next - (long)val < 0);
}
-static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
+static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
{
unsigned long val, next;
- val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+ val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
switch (target) {
case MEM_CGROUP_TARGET_THRESH:
@@ -709,34 +713,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
return;
}
- this_cpu_write(mem->stat->targets[target], next);
+ __this_cpu_write(memcg->stat->targets[target], next);
}
/*
* Check events in order.
*
*/
-static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
+ preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
- if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
- mem_cgroup_threshold(mem);
- __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
- if (unlikely(__memcg_event_check(mem,
+ if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
+ mem_cgroup_threshold(memcg);
+ __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
+ if (unlikely(__memcg_event_check(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT))) {
- mem_cgroup_update_tree(mem, page);
- __mem_cgroup_target_update(mem,
+ mem_cgroup_update_tree(memcg, page);
+ __mem_cgroup_target_update(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
}
#if MAX_NUMNODES > 1
- if (unlikely(__memcg_event_check(mem,
+ if (unlikely(__memcg_event_check(memcg,
MEM_CGROUP_TARGET_NUMAINFO))) {
- atomic_inc(&mem->numainfo_events);
- __mem_cgroup_target_update(mem,
+ atomic_inc(&memcg->numainfo_events);
+ __mem_cgroup_target_update(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
}
#endif
}
+ preempt_enable();
}
static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -762,7 +768,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
if (!mm)
return NULL;
@@ -773,25 +779,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
*/
rcu_read_lock();
do {
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!mem))
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
break;
- } while (!css_tryget(&mem->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
- return mem;
+ return memcg;
}
/* The caller has to guarantee "mem" exists before calling this */
-static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css;
int found;
- if (!mem) /* ROOT cgroup has the smallest ID */
+ if (!memcg) /* ROOT cgroup has the smallest ID */
return root_mem_cgroup; /*css_put/get against root is ignored*/
- if (!mem->use_hierarchy) {
- if (css_tryget(&mem->css))
- return mem;
+ if (!memcg->use_hierarchy) {
+ if (css_tryget(&memcg->css))
+ return memcg;
return NULL;
}
rcu_read_lock();
@@ -799,13 +805,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
* searching a memory cgroup which has the smallest ID under given
* ROOT cgroup. (ID >= 1)
*/
- css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+ css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
if (css && css_tryget(css))
- mem = container_of(css, struct mem_cgroup, css);
+ memcg = container_of(css, struct mem_cgroup, css);
else
- mem = NULL;
+ memcg = NULL;
rcu_read_unlock();
- return mem;
+ return memcg;
}
static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -859,29 +865,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
for_each_mem_cgroup_tree_cond(iter, NULL, true)
-static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
- return (mem == root_mem_cgroup);
+ return (memcg == root_mem_cgroup);
}
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
if (!mm)
return;
rcu_read_lock();
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!mem))
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
goto out;
switch (idx) {
case PGMAJFAULT:
- mem_cgroup_pgmajfault(mem, 1);
+ mem_cgroup_pgmajfault(memcg, 1);
break;
case PGFAULT:
- mem_cgroup_pgfault(mem, 1);
+ mem_cgroup_pgfault(memcg, 1);
break;
default:
BUG();
@@ -990,6 +996,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
return;
pc = lookup_page_cgroup(page);
VM_BUG_ON(PageCgroupAcctLRU(pc));
+ /*
+ * putback: charge:
+ * SetPageLRU SetPageCgroupUsed
+ * smp_mb smp_mb
+ * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
+ *
+ * Ensure that one of the two sides adds the page to the memcg
+ * LRU during a race.
+ */
+ smp_mb();
if (!PageCgroupUsed(pc))
return;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
@@ -1041,7 +1057,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page)
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
-
+ /*
+ * putback: charge:
+ * SetPageLRU SetPageCgroupUsed
+ * smp_mb smp_mb
+ * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
+ *
+ * Ensure that one of the two sides adds the page to the memcg
+ * LRU during a race.
+ */
+ smp_mb();
/* taking care of that the page is added to LRU while we commit it */
if (likely(!PageLRU(page)))
return;
@@ -1063,21 +1088,21 @@ void mem_cgroup_move_lists(struct page *page,
}
/*
- * Checks whether given mem is same or in the root_mem's
+ * Checks whether given mem is same or in the root_mem_cgroup's
* hierarchy subtree
*/
-static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
- struct mem_cgroup *mem)
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+ struct mem_cgroup *memcg)
{
- if (root_mem != mem) {
- return (root_mem->use_hierarchy &&
- css_is_ancestor(&mem->css, &root_mem->css));
+ if (root_memcg != memcg) {
+ return (root_memcg->use_hierarchy &&
+ css_is_ancestor(&memcg->css, &root_memcg->css));
}
return true;
}
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
{
int ret;
struct mem_cgroup *curr = NULL;
@@ -1091,25 +1116,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
if (!curr)
return 0;
/*
- * We should check use_hierarchy of "mem" not "curr". Because checking
+ * We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
- * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
- * hierarchy(even if use_hierarchy is disabled in "mem").
+ * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
+ * hierarchy(even if use_hierarchy is disabled in "memcg").
*/
- ret = mem_cgroup_same_or_subtree(mem, curr);
+ ret = mem_cgroup_same_or_subtree(memcg, curr);
css_put(&curr->css);
return ret;
}
-static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
- unsigned long active;
+ unsigned long inactive_ratio;
+ int nid = zone_to_nid(zone);
+ int zid = zone_idx(zone);
unsigned long inactive;
+ unsigned long active;
unsigned long gb;
- unsigned long inactive_ratio;
- inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
- active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
+ inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_INACTIVE_ANON));
+ active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_ACTIVE_ANON));
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -1117,39 +1146,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
else
inactive_ratio = 1;
- if (present_pages) {
- present_pages[0] = inactive;
- present_pages[1] = active;
- }
-
- return inactive_ratio;
+ return inactive * inactive_ratio < active;
}
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
-{
- unsigned long active;
- unsigned long inactive;
- unsigned long present_pages[2];
- unsigned long inactive_ratio;
-
- inactive_ratio = calc_inactive_ratio(memcg, present_pages);
-
- inactive = present_pages[0];
- active = present_pages[1];
-
- if (inactive * inactive_ratio < active)
- return 1;
-
- return 0;
-}
-
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
+int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
unsigned long active;
unsigned long inactive;
+ int zid = zone_idx(zone);
+ int nid = zone_to_nid(zone);
- inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
- active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
+ inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_INACTIVE_FILE));
+ active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_ACTIVE_FILE));
return (active > inactive);
}
@@ -1254,13 +1264,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
* Returns the maximum amount of memory @mem can be charged with, in
* pages.
*/
-static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long long margin;
- margin = res_counter_margin(&mem->res);
+ margin = res_counter_margin(&memcg->res);
if (do_swap_account)
- margin = min(margin, res_counter_margin(&mem->memsw));
+ margin = min(margin, res_counter_margin(&memcg->memsw));
return margin >> PAGE_SHIFT;
}
@@ -1275,33 +1285,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return memcg->swappiness;
}
-static void mem_cgroup_start_move(struct mem_cgroup *mem)
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
int cpu;
get_online_cpus();
- spin_lock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
- per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
- mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
- spin_unlock(&mem->pcp_counter_lock);
+ per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+ memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+ spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();
synchronize_rcu();
}
-static void mem_cgroup_end_move(struct mem_cgroup *mem)
+static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
int cpu;
- if (!mem)
+ if (!memcg)
return;
get_online_cpus();
- spin_lock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
- per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
- mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
- spin_unlock(&mem->pcp_counter_lock);
+ per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+ memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+ spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();
}
/*
@@ -1316,13 +1326,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem)
* waiting at hith-memory prressure caused by "move".
*/
-static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
{
VM_BUG_ON(!rcu_read_lock_held());
- return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+ return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}
-static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
struct mem_cgroup *from;
struct mem_cgroup *to;
@@ -1337,17 +1347,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
if (!from)
goto unlock;
- ret = mem_cgroup_same_or_subtree(mem, from)
- || mem_cgroup_same_or_subtree(mem, to);
+ ret = mem_cgroup_same_or_subtree(memcg, from)
+ || mem_cgroup_same_or_subtree(memcg, to);
unlock:
spin_unlock(&mc.lock);
return ret;
}
-static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
if (mc.moving_task && current != mc.moving_task) {
- if (mem_cgroup_under_move(mem)) {
+ if (mem_cgroup_under_move(memcg)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
@@ -1431,12 +1441,12 @@ done:
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
-static int mem_cgroup_count_children(struct mem_cgroup *mem)
+static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
int num = 0;
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
num++;
return num;
}
@@ -1466,21 +1476,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
* that to reclaim free pages from.
*/
static struct mem_cgroup *
-mem_cgroup_select_victim(struct mem_cgroup *root_mem)
+mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
{
struct mem_cgroup *ret = NULL;
struct cgroup_subsys_state *css;
int nextid, found;
- if (!root_mem->use_hierarchy) {
- css_get(&root_mem->css);
- ret = root_mem;
+ if (!root_memcg->use_hierarchy) {
+ css_get(&root_memcg->css);
+ ret = root_memcg;
}
while (!ret) {
rcu_read_lock();
- nextid = root_mem->last_scanned_child + 1;
- css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
+ nextid = root_memcg->last_scanned_child + 1;
+ css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
&found);
if (css && css_tryget(css))
ret = container_of(css, struct mem_cgroup, css);
@@ -1489,9 +1499,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
/* Updates scanning parameter */
if (!css) {
/* this means start scan from ID:1 */
- root_mem->last_scanned_child = 0;
+ root_memcg->last_scanned_child = 0;
} else
- root_mem->last_scanned_child = found;
+ root_memcg->last_scanned_child = found;
}
return ret;
@@ -1507,14 +1517,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
+ if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
+ if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
return true;
return false;
@@ -1527,29 +1537,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
int nid;
/*
* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
* pagein/pageout changes since the last update.
*/
- if (!atomic_read(&mem->numainfo_events))
+ if (!atomic_read(&memcg->numainfo_events))
return;
- if (atomic_inc_return(&mem->numainfo_updating) > 1)
+ if (atomic_inc_return(&memcg->numainfo_updating) > 1)
return;
/* make a nodemask where this memcg uses memory from */
- mem->scan_nodes = node_states[N_HIGH_MEMORY];
+ memcg->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
- if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
- node_clear(nid, mem->scan_nodes);
+ if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
+ node_clear(nid, memcg->scan_nodes);
}
- atomic_set(&mem->numainfo_events, 0);
- atomic_set(&mem->numainfo_updating, 0);
+ atomic_set(&memcg->numainfo_events, 0);
+ atomic_set(&memcg->numainfo_updating, 0);
}
/*
@@ -1564,16 +1574,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
int node;
- mem_cgroup_may_update_nodemask(mem);
- node = mem->last_scanned_node;
+ mem_cgroup_may_update_nodemask(memcg);
+ node = memcg->last_scanned_node;
- node = next_node(node, mem->scan_nodes);
+ node = next_node(node, memcg->scan_nodes);
if (node == MAX_NUMNODES)
- node = first_node(mem->scan_nodes);
+ node = first_node(memcg->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
@@ -1583,7 +1593,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
- mem->last_scanned_node = node;
+ memcg->last_scanned_node = node;
return node;
}
@@ -1593,7 +1603,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
* unused nodes. But scan_nodes is lazily updated and may not cotain
* enough new information. We need to do double check.
*/
-bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
@@ -1601,12 +1611,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
* quick check...making use of scan_node.
* We can skip unused nodes.
*/
- if (!nodes_empty(mem->scan_nodes)) {
- for (nid = first_node(mem->scan_nodes);
+ if (!nodes_empty(memcg->scan_nodes)) {
+ for (nid = first_node(memcg->scan_nodes);
nid < MAX_NUMNODES;
- nid = next_node(nid, mem->scan_nodes)) {
+ nid = next_node(nid, memcg->scan_nodes)) {
- if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
}
@@ -1614,23 +1624,23 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
* Check rest of nodes.
*/
for_each_node_state(nid, N_HIGH_MEMORY) {
- if (node_isset(nid, mem->scan_nodes))
+ if (node_isset(nid, memcg->scan_nodes))
continue;
- if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
return false;
}
#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
-bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
- return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+ return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
#endif
@@ -1639,14 +1649,14 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
* we reclaimed from, so that we don't end up penalizing one child extensively
* based on its position in the children list.
*
- * root_mem is the original ancestor that we've been reclaim from.
+ * root_memcg is the original ancestor that we've been reclaim from.
*
- * We give up and return to the caller when we visit root_mem twice.
+ * We give up and return to the caller when we visit root_memcg twice.
* (other groups can be removed while we're walking....)
*
* If shrink==true, for avoiding to free too much, this returns immedieately.
*/
-static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
struct zone *zone,
gfp_t gfp_mask,
unsigned long reclaim_options,
@@ -1661,15 +1671,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
unsigned long excess;
unsigned long nr_scanned;
- excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
+ excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
- if (!check_soft && !shrink && root_mem->memsw_is_minimum)
+ if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
noswap = true;
while (1) {
- victim = mem_cgroup_select_victim(root_mem);
- if (victim == root_mem) {
+ victim = mem_cgroup_select_victim(root_memcg);
+ if (victim == root_memcg) {
loop++;
/*
* We are not draining per cpu cached charges during
@@ -1678,7 +1688,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
* charges will not give any.
*/
if (!check_soft && loop >= 1)
- drain_all_stock_async(root_mem);
+ drain_all_stock_async(root_memcg);
if (loop >= 2) {
/*
* If we have not been able to reclaim
@@ -1725,9 +1735,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
return ret;
total += ret;
if (check_soft) {
- if (!res_counter_soft_limit_excess(&root_mem->res))
+ if (!res_counter_soft_limit_excess(&root_memcg->res))
return total;
- } else if (mem_cgroup_margin(root_mem))
+ } else if (mem_cgroup_margin(root_memcg))
return total;
}
return total;
@@ -1738,12 +1748,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
* If someone is running, return false.
* Has to be called with memcg_oom_lock
*/
-static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter, *failed = NULL;
bool cond = true;
- for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
if (iter->oom_lock) {
/*
* this subtree of our hierarchy is already locked
@@ -1763,7 +1773,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
* what we set up to the failing subtree
*/
cond = true;
- for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
if (iter == failed) {
cond = false;
continue;
@@ -1776,24 +1786,24 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
/*
* Has to be called with memcg_oom_lock
*/
-static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
return 0;
}
-static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
-static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
@@ -1802,7 +1812,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
}
@@ -1817,85 +1827,85 @@ struct oom_wait_info {
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
- struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
- *oom_wait_mem;
+ struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
+ *oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
- oom_wait_mem = oom_wait_info->mem;
+ oom_wait_memcg = oom_wait_info->mem;
/*
* Both of oom_wait_info->mem and wake_mem are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
- if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
- && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
+ if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
+ && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
-static void memcg_wakeup_oom(struct mem_cgroup *mem)
+static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
- /* for filtering, pass "mem" as argument. */
- __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
+ /* for filtering, pass "memcg" as argument. */
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
-static void memcg_oom_recover(struct mem_cgroup *mem)
+static void memcg_oom_recover(struct mem_cgroup *memcg)
{
- if (mem && atomic_read(&mem->under_oom))
- memcg_wakeup_oom(mem);
+ if (memcg && atomic_read(&memcg->under_oom))
+ memcg_wakeup_oom(memcg);
}
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
-bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
{
struct oom_wait_info owait;
bool locked, need_to_kill;
- owait.mem = mem;
+ owait.mem = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
need_to_kill = true;
- mem_cgroup_mark_under_oom(mem);
+ mem_cgroup_mark_under_oom(memcg);
- /* At first, try to OOM lock hierarchy under mem.*/
+ /* At first, try to OOM lock hierarchy under memcg.*/
spin_lock(&memcg_oom_lock);
- locked = mem_cgroup_oom_lock(mem);
+ locked = mem_cgroup_oom_lock(memcg);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- if (!locked || mem->oom_kill_disable)
+ if (!locked || memcg->oom_kill_disable)
need_to_kill = false;
if (locked)
- mem_cgroup_oom_notify(mem);
+ mem_cgroup_oom_notify(memcg);
spin_unlock(&memcg_oom_lock);
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(mem, mask);
+ mem_cgroup_out_of_memory(memcg, mask);
} else {
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
}
spin_lock(&memcg_oom_lock);
if (locked)
- mem_cgroup_oom_unlock(mem);
- memcg_wakeup_oom(mem);
+ mem_cgroup_oom_unlock(memcg);
+ memcg_wakeup_oom(memcg);
spin_unlock(&memcg_oom_lock);
- mem_cgroup_unmark_under_oom(mem);
+ mem_cgroup_unmark_under_oom(memcg);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
- schedule_timeout(1);
+ schedule_timeout_uninterruptible(1);
return true;
}
@@ -1926,7 +1936,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
unsigned long uninitialized_var(flags);
@@ -1935,16 +1945,16 @@ void mem_cgroup_update_page_stat(struct page *page,
return;
rcu_read_lock();
- mem = pc->mem_cgroup;
- if (unlikely(!mem || !PageCgroupUsed(pc)))
+ memcg = pc->mem_cgroup;
+ if (unlikely(!memcg || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
- if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
+ if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
/* take a lock against to access pc->mem_cgroup */
move_lock_page_cgroup(pc, &flags);
need_unlock = true;
- mem = pc->mem_cgroup;
- if (!mem || !PageCgroupUsed(pc))
+ memcg = pc->mem_cgroup;
+ if (!memcg || !PageCgroupUsed(pc))
goto out;
}
@@ -1960,7 +1970,7 @@ void mem_cgroup_update_page_stat(struct page *page,
BUG();
}
- this_cpu_add(mem->stat->count[idx], val);
+ this_cpu_add(memcg->stat->count[idx], val);
out:
if (unlikely(need_unlock))
@@ -1991,13 +2001,13 @@ static DEFINE_MUTEX(percpu_charge_mutex);
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
-static bool consume_stock(struct mem_cgroup *mem)
+static bool consume_stock(struct mem_cgroup *memcg)
{
struct memcg_stock_pcp *stock;
bool ret = true;
stock = &get_cpu_var(memcg_stock);
- if (mem == stock->cached && stock->nr_pages)
+ if (memcg == stock->cached && stock->nr_pages)
stock->nr_pages--;
else /* need to call res_counter_charge */
ret = false;
@@ -2038,24 +2048,24 @@ static void drain_local_stock(struct work_struct *dummy)
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
- if (stock->cached != mem) { /* reset if necessary */
+ if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
- stock->cached = mem;
+ stock->cached = memcg;
}
stock->nr_pages += nr_pages;
put_cpu_var(memcg_stock);
}
/*
- * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it. sync flag says whether we should block
* until the work is done.
*/
-static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
+static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
{
int cpu, curcpu;
@@ -2064,12 +2074,12 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
- mem = stock->cached;
- if (!mem || !stock->nr_pages)
+ memcg = stock->cached;
+ if (!memcg || !stock->nr_pages)
continue;
- if (!mem_cgroup_same_or_subtree(root_mem, mem))
+ if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
continue;
if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
@@ -2098,23 +2108,23 @@ out:
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
-static void drain_all_stock_async(struct mem_cgroup *root_mem)
+static void drain_all_stock_async(struct mem_cgroup *root_memcg)
{
/*
* If someone calls draining, avoid adding more kworker runs.
*/
if (!mutex_trylock(&percpu_charge_mutex))
return;
- drain_all_stock(root_mem, false);
+ drain_all_stock(root_memcg, false);
mutex_unlock(&percpu_charge_mutex);
}
/* This is a synchronous drain interface. */
-static void drain_all_stock_sync(struct mem_cgroup *root_mem)
+static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
{
/* called when force_empty is called */
mutex_lock(&percpu_charge_mutex);
- drain_all_stock(root_mem, true);
+ drain_all_stock(root_memcg, true);
mutex_unlock(&percpu_charge_mutex);
}
@@ -2122,35 +2132,35 @@ static void drain_all_stock_sync(struct mem_cgroup *root_mem)
* This function drains percpu counter value from DEAD cpu and
* move it to local cpu. Note that this function can be preempted.
*/
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
{
int i;
- spin_lock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
- long x = per_cpu(mem->stat->count[i], cpu);
+ long x = per_cpu(memcg->stat->count[i], cpu);
- per_cpu(mem->stat->count[i], cpu) = 0;
- mem->nocpu_base.count[i] += x;
+ per_cpu(memcg->stat->count[i], cpu) = 0;
+ memcg->nocpu_base.count[i] += x;
}
for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
- unsigned long x = per_cpu(mem->stat->events[i], cpu);
+ unsigned long x = per_cpu(memcg->stat->events[i], cpu);
- per_cpu(mem->stat->events[i], cpu) = 0;
- mem->nocpu_base.events[i] += x;
+ per_cpu(memcg->stat->events[i], cpu) = 0;
+ memcg->nocpu_base.events[i] += x;
}
/* need to clear ON_MOVE value, works as a kind of lock. */
- per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
- spin_unlock(&mem->pcp_counter_lock);
+ per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+ spin_unlock(&memcg->pcp_counter_lock);
}
-static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
{
int idx = MEM_CGROUP_ON_MOVE;
- spin_lock(&mem->pcp_counter_lock);
- per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
- spin_unlock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
+ per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
+ spin_unlock(&memcg->pcp_counter_lock);
}
static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
@@ -2188,7 +2198,7 @@ enum {
CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
-static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages, bool oom_check)
{
unsigned long csize = nr_pages * PAGE_SIZE;
@@ -2197,16 +2207,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
unsigned long flags = 0;
int ret;
- ret = res_counter_charge(&mem->res, csize, &fail_res);
+ ret = res_counter_charge(&memcg->res, csize, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
return CHARGE_OK;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
if (likely(!ret))
return CHARGE_OK;
- res_counter_uncharge(&mem->res, csize);
+ res_counter_uncharge(&memcg->res, csize);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
@@ -2264,12 +2274,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask,
unsigned int nr_pages,
- struct mem_cgroup **memcg,
+ struct mem_cgroup **ptr,
bool oom)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
int ret;
/*
@@ -2287,17 +2297,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (!*memcg && !mm)
+ if (!*ptr && !mm)
goto bypass;
again:
- if (*memcg) { /* css should be a valid one */
- mem = *memcg;
- VM_BUG_ON(css_is_removed(&mem->css));
- if (mem_cgroup_is_root(mem))
+ if (*ptr) { /* css should be a valid one */
+ memcg = *ptr;
+ VM_BUG_ON(css_is_removed(&memcg->css));
+ if (mem_cgroup_is_root(memcg))
goto done;
- if (nr_pages == 1 && consume_stock(mem))
+ if (nr_pages == 1 && consume_stock(memcg))
goto done;
- css_get(&mem->css);
+ css_get(&memcg->css);
} else {
struct task_struct *p;
@@ -2305,7 +2315,7 @@ again:
p = rcu_dereference(mm->owner);
/*
* Because we don't have task_lock(), "p" can exit.
- * In that case, "mem" can point to root or p can be NULL with
+ * In that case, "memcg" can point to root or p can be NULL with
* race with swapoff. Then, we have small risk of mis-accouning.
* But such kind of mis-account by race always happens because
* we don't have cgroup_mutex(). It's overkill and we allo that
@@ -2313,12 +2323,12 @@ again:
* (*) swapoff at el will charge against mm-struct not against
* task-struct. So, mm->owner can be NULL.
*/
- mem = mem_cgroup_from_task(p);
- if (!mem || mem_cgroup_is_root(mem)) {
+ memcg = mem_cgroup_from_task(p);
+ if (!memcg || mem_cgroup_is_root(memcg)) {
rcu_read_unlock();
goto done;
}
- if (nr_pages == 1 && consume_stock(mem)) {
+ if (nr_pages == 1 && consume_stock(memcg)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
@@ -2331,7 +2341,7 @@ again:
goto done;
}
/* after here, we may be blocked. we need to get refcnt */
- if (!css_tryget(&mem->css)) {
+ if (!css_tryget(&memcg->css)) {
rcu_read_unlock();
goto again;
}
@@ -2343,7 +2353,7 @@ again:
/* If killed, bypass charge */
if (fatal_signal_pending(current)) {
- css_put(&mem->css);
+ css_put(&memcg->css);
goto bypass;
}
@@ -2353,43 +2363,43 @@ again:
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
- css_put(&mem->css);
- mem = NULL;
+ css_put(&memcg->css);
+ memcg = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- css_put(&mem->css);
+ css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
if (!oom) {
- css_put(&mem->css);
+ css_put(&memcg->css);
goto nomem;
}
/* If oom, we never return -ENOMEM */
nr_oom_retries--;
break;
case CHARGE_OOM_DIE: /* Killed by OOM Killer */
- css_put(&mem->css);
+ css_put(&memcg->css);
goto bypass;
}
} while (ret != CHARGE_OK);
if (batch > nr_pages)
- refill_stock(mem, batch - nr_pages);
- css_put(&mem->css);
+ refill_stock(memcg, batch - nr_pages);
+ css_put(&memcg->css);
done:
- *memcg = mem;
+ *ptr = memcg;
return 0;
nomem:
- *memcg = NULL;
+ *ptr = NULL;
return -ENOMEM;
bypass:
- *memcg = NULL;
+ *ptr = NULL;
return 0;
}
@@ -2398,15 +2408,15 @@ bypass:
* This function is for that and do uncharge, put css's refcnt.
* gotten by try_charge().
*/
-static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
unsigned int nr_pages)
{
- if (!mem_cgroup_is_root(mem)) {
+ if (!mem_cgroup_is_root(memcg)) {
unsigned long bytes = nr_pages * PAGE_SIZE;
- res_counter_uncharge(&mem->res, bytes);
+ res_counter_uncharge(&memcg->res, bytes);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, bytes);
+ res_counter_uncharge(&memcg->memsw, bytes);
}
}
@@ -2431,7 +2441,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
unsigned short id;
swp_entry_t ent;
@@ -2441,23 +2451,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
- if (mem && !css_tryget(&mem->css))
- mem = NULL;
+ memcg = pc->mem_cgroup;
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
id = lookup_swap_cgroup(ent);
rcu_read_lock();
- mem = mem_cgroup_lookup(id);
- if (mem && !css_tryget(&mem->css))
- mem = NULL;
+ memcg = mem_cgroup_lookup(id);
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
rcu_read_unlock();
}
unlock_page_cgroup(pc);
- return mem;
+ return memcg;
}
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
@@ -2466,14 +2476,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- __mem_cgroup_cancel_charge(mem, nr_pages);
+ __mem_cgroup_cancel_charge(memcg, nr_pages);
return;
}
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
*/
- pc->mem_cgroup = mem;
+ pc->mem_cgroup = memcg;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -2496,14 +2506,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
break;
}
- mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+ mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
unlock_page_cgroup(pc);
/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
*/
- memcg_check_events(mem, page);
+ memcg_check_events(memcg, page);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2690,7 +2700,7 @@ out:
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
struct page_cgroup *pc;
bool oom = true;
@@ -2709,11 +2719,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
pc = lookup_page_cgroup(page);
BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
- ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
- if (ret || !mem)
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
+ if (ret || !memcg)
return ret;
- __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
+ __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
return 0;
}
@@ -2742,7 +2752,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype);
static void
-__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
+__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
enum charge_type ctype)
{
struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2752,7 +2762,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
* LRU. Take care of it.
*/
mem_cgroup_lru_del_before_commit(page);
- __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
+ __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
mem_cgroup_lru_add_after_commit(page);
return;
}
@@ -2760,7 +2770,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
int ret;
if (mem_cgroup_disabled())
@@ -2772,8 +2782,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
mm = &init_mm;
if (page_is_file_cache(page)) {
- ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
- if (ret || !mem)
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
+ if (ret || !memcg)
return ret;
/*
@@ -2781,15 +2791,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
* put that would remove them from the LRU list, make
* sure that they get relinked properly.
*/
- __mem_cgroup_commit_charge_lrucare(page, mem,
+ __mem_cgroup_commit_charge_lrucare(page, memcg,
MEM_CGROUP_CHARGE_TYPE_CACHE);
return ret;
}
/* shmem */
if (PageSwapCache(page)) {
- ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
+ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
if (!ret)
- __mem_cgroup_commit_charge_swapin(page, mem,
+ __mem_cgroup_commit_charge_swapin(page, memcg,
MEM_CGROUP_CHARGE_TYPE_SHMEM);
} else
ret = mem_cgroup_charge_common(page, mm, gfp_mask,
@@ -2808,7 +2818,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
gfp_t mask, struct mem_cgroup **ptr)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
int ret;
*ptr = NULL;
@@ -2826,12 +2836,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
*/
if (!PageSwapCache(page))
goto charge_cur_mm;
- mem = try_get_mem_cgroup_from_page(page);
- if (!mem)
+ memcg = try_get_mem_cgroup_from_page(page);
+ if (!memcg)
goto charge_cur_mm;
- *ptr = mem;
+ *ptr = memcg;
ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
- css_put(&mem->css);
+ css_put(&memcg->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
@@ -2891,16 +2901,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return;
- if (!mem)
+ if (!memcg)
return;
- __mem_cgroup_cancel_charge(mem, 1);
+ __mem_cgroup_cancel_charge(memcg, 1);
}
-static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
+static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
unsigned int nr_pages,
const enum charge_type ctype)
{
@@ -2918,7 +2928,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
* uncharges. Then, it's ok to ignore memcg's refcnt.
*/
if (!batch->memcg)
- batch->memcg = mem;
+ batch->memcg = memcg;
/*
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
* In those cases, all pages freed continuously can be expected to be in
@@ -2938,7 +2948,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
* merge a series of uncharges to an uncharge of res_counter.
* If not, we uncharge res_counter one by one.
*/
- if (batch->memcg != mem)
+ if (batch->memcg != memcg)
goto direct_uncharge;
/* remember freed charge and uncharge it later */
batch->nr_pages++;
@@ -2946,11 +2956,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
batch->memsw_nr_pages++;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
+ res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
- if (unlikely(batch->memcg != mem))
- memcg_oom_recover(mem);
+ res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
+ if (unlikely(batch->memcg != memcg))
+ memcg_oom_recover(memcg);
return;
}
@@ -2960,7 +2970,7 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
struct page_cgroup *pc;
@@ -2983,7 +2993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
lock_page_cgroup(pc);
- mem = pc->mem_cgroup;
+ memcg = pc->mem_cgroup;
if (!PageCgroupUsed(pc))
goto unlock_out;
@@ -3006,7 +3016,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
+ mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -3018,18 +3028,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
unlock_page_cgroup(pc);
/*
- * even after unlock, we have mem->res.usage here and this memcg
+ * even after unlock, we have memcg->res.usage here and this memcg
* will never be freed.
*/
- memcg_check_events(mem, page);
+ memcg_check_events(memcg, page);
if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
- mem_cgroup_swap_statistics(mem, true);
- mem_cgroup_get(mem);
+ mem_cgroup_swap_statistics(memcg, true);
+ mem_cgroup_get(memcg);
}
- if (!mem_cgroup_is_root(mem))
- mem_cgroup_do_uncharge(mem, nr_pages, ctype);
+ if (!mem_cgroup_is_root(memcg))
+ mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
- return mem;
+ return memcg;
unlock_out:
unlock_page_cgroup(pc);
@@ -3219,7 +3229,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
int mem_cgroup_prepare_migration(struct page *page,
struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
enum charge_type ctype;
int ret = 0;
@@ -3233,8 +3243,8 @@ int mem_cgroup_prepare_migration(struct page *page,
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
- css_get(&mem->css);
+ memcg = pc->mem_cgroup;
+ css_get(&memcg->css);
/*
* At migrating an anonymous page, its mapcount goes down
* to 0 and uncharge() will be called. But, even if it's fully
@@ -3272,12 +3282,12 @@ int mem_cgroup_prepare_migration(struct page *page,
* If the page is not charged at this point,
* we return here.
*/
- if (!mem)
+ if (!memcg)
return 0;
- *ptr = mem;
+ *ptr = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
- css_put(&mem->css);/* drop extra refcnt */
+ css_put(&memcg->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
lock_page_cgroup(pc);
@@ -3303,21 +3313,21 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
+ __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
return ret;
}
/* remove redundant charge if migration failed */
-void mem_cgroup_end_migration(struct mem_cgroup *mem,
+void mem_cgroup_end_migration(struct mem_cgroup *memcg,
struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
- if (!mem)
+ if (!memcg)
return;
/* blocks rmdir() */
- cgroup_exclude_rmdir(&mem->css);
+ cgroup_exclude_rmdir(&memcg->css);
if (!migration_ok) {
used = oldpage;
unused = newpage;
@@ -3353,7 +3363,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
- cgroup_release_and_wakeup_rmdir(&mem->css);
+ cgroup_release_and_wakeup_rmdir(&memcg->css);
}
#ifdef CONFIG_DEBUG_VM
@@ -3432,7 +3442,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
- * We have to guarantee mem->res.limit < mem->memsw.limit.
+ * We have to guarantee memcg->res.limit < memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3494,7 +3504,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
- * We have to guarantee mem->res.limit < mem->memsw.limit.
+ * We have to guarantee memcg->res.limit < memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3632,7 +3642,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* This routine traverses the page_cgroups in the given list and drops them all.
* *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
*/
-static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
int node, int zid, enum lru_list lru)
{
struct zone *zone;
@@ -3643,7 +3653,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
int ret = 0;
zone = &NODE_DATA(node)->node_zones[zid];
- mz = mem_cgroup_zoneinfo(mem, node, zid);
+ mz = mem_cgroup_zoneinfo(memcg, node, zid);
list = &mz->lists[lru];
loop = MEM_CGROUP_ZSTAT(mz, lru);
@@ -3670,7 +3680,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
page = lookup_cgroup_page(pc);
- ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
+ ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
if (ret == -ENOMEM)
break;
@@ -3691,14 +3701,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
* make mem_cgroup's charge to be 0 if there is no task.
* This enables deleting this mem_cgroup.
*/
-static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
{
int ret;
int node, zid, shrink;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct cgroup *cgrp = mem->css.cgroup;
+ struct cgroup *cgrp = memcg->css.cgroup;
- css_get(&mem->css);
+ css_get(&memcg->css);
shrink = 0;
/* should free all ? */
@@ -3714,14 +3724,14 @@ move_account:
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
- drain_all_stock_sync(mem);
+ drain_all_stock_sync(memcg);
ret = 0;
- mem_cgroup_start_move(mem);
+ mem_cgroup_start_move(memcg);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
enum lru_list l;
for_each_lru(l) {
- ret = mem_cgroup_force_empty_list(mem,
+ ret = mem_cgroup_force_empty_list(memcg,
node, zid, l);
if (ret)
break;
@@ -3730,16 +3740,16 @@ move_account:
if (ret)
break;
}
- mem_cgroup_end_move(mem);
- memcg_oom_recover(mem);
+ mem_cgroup_end_move(memcg);
+ memcg_oom_recover(memcg);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
goto try_to_free;
cond_resched();
/* "ret" should also be checked to ensure all lists are empty. */
- } while (mem->res.usage > 0 || ret);
+ } while (memcg->res.usage > 0 || ret);
out:
- css_put(&mem->css);
+ css_put(&memcg->css);
return ret;
try_to_free:
@@ -3752,14 +3762,14 @@ try_to_free:
lru_add_drain_all();
/* try to free all pages in this cgroup */
shrink = 1;
- while (nr_retries && mem->res.usage > 0) {
+ while (nr_retries && memcg->res.usage > 0) {
int progress;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
- progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
+ progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
false);
if (!progress) {
nr_retries--;
@@ -3788,12 +3798,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
u64 val)
{
int retval = 0;
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
struct cgroup *parent = cont->parent;
- struct mem_cgroup *parent_mem = NULL;
+ struct mem_cgroup *parent_memcg = NULL;
if (parent)
- parent_mem = mem_cgroup_from_cont(parent);
+ parent_memcg = mem_cgroup_from_cont(parent);
cgroup_lock();
/*
@@ -3804,10 +3814,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
* For the root cgroup, parent_memcg is NULL, we allow value to be
* set if there are no children.
*/
- if ((!parent_mem || !parent_mem->use_hierarchy) &&
+ if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
if (list_empty(&cont->children))
- mem->use_hierarchy = val;
+ memcg->use_hierarchy = val;
else
retval = -EBUSY;
} else
@@ -3818,14 +3828,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
}
-static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
long val = 0;
/* Per-cpu values can be negative, use a signed accumulator */
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
val += mem_cgroup_read_stat(iter, idx);
if (val < 0) /* race ? */
@@ -3833,29 +3843,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
return val;
}
-static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
u64 val;
- if (!mem_cgroup_is_root(mem)) {
+ if (!mem_cgroup_is_root(memcg)) {
if (!swap)
- return res_counter_read_u64(&mem->res, RES_USAGE);
+ return res_counter_read_u64(&memcg->res, RES_USAGE);
else
- return res_counter_read_u64(&mem->memsw, RES_USAGE);
+ return res_counter_read_u64(&memcg->memsw, RES_USAGE);
}
- val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
if (swap)
- val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
return val << PAGE_SHIFT;
}
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
u64 val;
int type, name;
@@ -3864,15 +3874,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
switch (type) {
case _MEM:
if (name == RES_USAGE)
- val = mem_cgroup_usage(mem, false);
+ val = mem_cgroup_usage(memcg, false);
else
- val = res_counter_read_u64(&mem->res, name);
+ val = res_counter_read_u64(&memcg->res, name);
break;
case _MEMSWAP:
if (name == RES_USAGE)
- val = mem_cgroup_usage(mem, true);
+ val = mem_cgroup_usage(memcg, true);
else
- val = res_counter_read_u64(&mem->memsw, name);
+ val = res_counter_read_u64(&memcg->memsw, name);
break;
default:
BUG();
@@ -3960,24 +3970,24 @@ out:
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
int type, name;
- mem = mem_cgroup_from_cont(cont);
+ memcg = mem_cgroup_from_cont(cont);
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
switch (name) {
case RES_MAX_USAGE:
if (type == _MEM)
- res_counter_reset_max(&mem->res);
+ res_counter_reset_max(&memcg->res);
else
- res_counter_reset_max(&mem->memsw);
+ res_counter_reset_max(&memcg->memsw);
break;
case RES_FAILCNT:
if (type == _MEM)
- res_counter_reset_failcnt(&mem->res);
+ res_counter_reset_failcnt(&memcg->res);
else
- res_counter_reset_failcnt(&mem->memsw);
+ res_counter_reset_failcnt(&memcg->memsw);
break;
}
@@ -3994,7 +4004,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
if (val >= (1 << NR_MOVE_TYPE))
return -EINVAL;
@@ -4004,7 +4014,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
* inconsistent.
*/
cgroup_lock();
- mem->move_charge_at_immigrate = val;
+ memcg->move_charge_at_immigrate = val;
cgroup_unlock();
return 0;
@@ -4061,49 +4071,49 @@ struct {
static void
-mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
+mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
{
s64 val;
/* per cpu stat */
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
s->stat[MCS_PGFAULT] += val;
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
s->stat[MCS_PGMAJFAULT] += val;
/* per zone stat */
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
}
static void
-mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
+mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
mem_cgroup_get_local_stat(iter, s);
}
@@ -4189,8 +4199,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
}
#ifdef CONFIG_DEBUG_VM
- cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
-
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
@@ -4327,20 +4335,20 @@ static int compare_thresholds(const void *a, const void *b)
return _a->threshold - _b->threshold;
}
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
struct mem_cgroup_eventfd_list *ev;
- list_for_each_entry(ev, &mem->oom_notify, list)
+ list_for_each_entry(ev, &memcg->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
return 0;
}
-static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
mem_cgroup_oom_notify_cb(iter);
}
@@ -4530,7 +4538,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *ev, *tmp;
int type = MEMFILE_TYPE(cft->private);
@@ -4538,7 +4546,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
spin_lock(&memcg_oom_lock);
- list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
+ list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
if (ev->eventfd == eventfd) {
list_del(&ev->list);
kfree(ev);
@@ -4551,11 +4559,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
struct cftype *cft, struct cgroup_map_cb *cb)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
- cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+ cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
- if (atomic_read(&mem->under_oom))
+ if (atomic_read(&memcg->under_oom))
cb->fill(cb, "under_oom", 1);
else
cb->fill(cb, "under_oom", 0);
@@ -4565,7 +4573,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent;
/* cannot set to root cgroup and only 0 and 1 are allowed */
@@ -4577,13 +4585,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
cgroup_lock();
/* oom-kill-disable is a flag for subhierarchy. */
if ((parent->use_hierarchy) ||
- (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+ (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
cgroup_unlock();
return -EINVAL;
}
- mem->oom_kill_disable = val;
+ memcg->oom_kill_disable = val;
if (!val)
- memcg_oom_recover(mem);
+ memcg_oom_recover(memcg);
cgroup_unlock();
return 0;
}
@@ -4719,7 +4727,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
}
#endif
-static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
@@ -4739,21 +4747,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
if (!pn)
return 1;
- mem->info.nodeinfo[node] = pn;
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
mz->usage_in_excess = 0;
mz->on_tree = false;
- mz->mem = mem;
+ mz->mem = memcg;
}
+ memcg->info.nodeinfo[node] = pn;
return 0;
}
-static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
- kfree(mem->info.nodeinfo[node]);
+ kfree(memcg->info.nodeinfo[node]);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -4795,51 +4803,51 @@ out_free:
* Removal of cgroup itself succeeds regardless of refs from swap.
*/
-static void __mem_cgroup_free(struct mem_cgroup *mem)
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- mem_cgroup_remove_from_trees(mem);
- free_css_id(&mem_cgroup_subsys, &mem->css);
+ mem_cgroup_remove_from_trees(memcg);
+ free_css_id(&mem_cgroup_subsys, &memcg->css);
for_each_node_state(node, N_POSSIBLE)
- free_mem_cgroup_per_zone_info(mem, node);
+ free_mem_cgroup_per_zone_info(memcg, node);
- free_percpu(mem->stat);
+ free_percpu(memcg->stat);
if (sizeof(struct mem_cgroup) < PAGE_SIZE)
- kfree(mem);
+ kfree(memcg);
else
- vfree(mem);
+ vfree(memcg);
}
-static void mem_cgroup_get(struct mem_cgroup *mem)
+static void mem_cgroup_get(struct mem_cgroup *memcg)
{
- atomic_inc(&mem->refcnt);
+ atomic_inc(&memcg->refcnt);
}
-static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
+static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
{
- if (atomic_sub_and_test(count, &mem->refcnt)) {
- struct mem_cgroup *parent = parent_mem_cgroup(mem);
- __mem_cgroup_free(mem);
+ if (atomic_sub_and_test(count, &memcg->refcnt)) {
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ __mem_cgroup_free(memcg);
if (parent)
mem_cgroup_put(parent);
}
}
-static void mem_cgroup_put(struct mem_cgroup *mem)
+static void mem_cgroup_put(struct mem_cgroup *memcg)
{
- __mem_cgroup_put(mem, 1);
+ __mem_cgroup_put(memcg, 1);
}
/*
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
*/
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
- if (!mem->res.parent)
+ if (!memcg->res.parent)
return NULL;
- return mem_cgroup_from_res_counter(mem->res.parent, res);
+ return mem_cgroup_from_res_counter(memcg->res.parent, res);
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4882,16 +4890,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
- struct mem_cgroup *mem, *parent;
+ struct mem_cgroup *memcg, *parent;
long error = -ENOMEM;
int node;
- mem = mem_cgroup_alloc();
- if (!mem)
+ memcg = mem_cgroup_alloc();
+ if (!memcg)
return ERR_PTR(error);
for_each_node_state(node, N_POSSIBLE)
- if (alloc_mem_cgroup_per_zone_info(mem, node))
+ if (alloc_mem_cgroup_per_zone_info(memcg, node))
goto free_out;
/* root ? */
@@ -4899,7 +4907,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
int cpu;
enable_swap_cgroup();
parent = NULL;
- root_mem_cgroup = mem;
+ root_mem_cgroup = memcg;
if (mem_cgroup_soft_limit_tree_init())
goto free_out;
for_each_possible_cpu(cpu) {
@@ -4910,13 +4918,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
- mem->use_hierarchy = parent->use_hierarchy;
- mem->oom_kill_disable = parent->oom_kill_disable;
+ memcg->use_hierarchy = parent->use_hierarchy;
+ memcg->oom_kill_disable = parent->oom_kill_disable;
}
if (parent && parent->use_hierarchy) {
- res_counter_init(&mem->res, &parent->res);
- res_counter_init(&mem->memsw, &parent->memsw);
+ res_counter_init(&memcg->res, &parent->res);
+ res_counter_init(&memcg->memsw, &parent->memsw);
/*
* We increment refcnt of the parent to ensure that we can
* safely access it on res_counter_charge/uncharge.
@@ -4925,21 +4933,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
*/
mem_cgroup_get(parent);
} else {
- res_counter_init(&mem->res, NULL);
- res_counter_init(&mem->memsw, NULL);
+ res_counter_init(&memcg->res, NULL);
+ res_counter_init(&memcg->memsw, NULL);
}
- mem->last_scanned_child = 0;
- mem->last_scanned_node = MAX_NUMNODES;
- INIT_LIST_HEAD(&mem->oom_notify);
+ memcg->last_scanned_child = 0;
+ memcg->last_scanned_node = MAX_NUMNODES;
+ INIT_LIST_HEAD(&memcg->oom_notify);
if (parent)
- mem->swappiness = mem_cgroup_swappiness(parent);
- atomic_set(&mem->refcnt, 1);
- mem->move_charge_at_immigrate = 0;
- mutex_init(&mem->thresholds_lock);
- return &mem->css;
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ atomic_set(&memcg->refcnt, 1);
+ memcg->move_charge_at_immigrate = 0;
+ mutex_init(&memcg->thresholds_lock);
+ return &memcg->css;
free_out:
- __mem_cgroup_free(mem);
+ __mem_cgroup_free(memcg);
root_mem_cgroup = NULL;
return ERR_PTR(error);
}
@@ -4947,17 +4955,17 @@ free_out:
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- return mem_cgroup_force_empty(mem, false);
+ return mem_cgroup_force_empty(memcg, false);
}
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- mem_cgroup_put(mem);
+ mem_cgroup_put(memcg);
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -4980,9 +4988,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
{
int ret = 0;
int batch_count = PRECHARGE_COUNT_AT_ONCE;
- struct mem_cgroup *mem = mc.to;
+ struct mem_cgroup *memcg = mc.to;
- if (mem_cgroup_is_root(mem)) {
+ if (mem_cgroup_is_root(memcg)) {
mc.precharge += count;
/* we don't need css_get for root */
return ret;
@@ -4991,16 +4999,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
if (count > 1) {
struct res_counter *dummy;
/*
- * "mem" cannot be under rmdir() because we've already checked
+ * "memcg" cannot be under rmdir() because we've already checked
* by cgroup_lock_live_cgroup() that it is not removed and we
* are still under the same cgroup_mutex. So we can postpone
* css_get().
*/
- if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
+ if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
goto one_by_one;
- if (do_swap_account && res_counter_charge(&mem->memsw,
+ if (do_swap_account && res_counter_charge(&memcg->memsw,
PAGE_SIZE * count, &dummy)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+ res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
goto one_by_one;
}
mc.precharge += count;
@@ -5017,8 +5025,9 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
- if (ret || !mem)
+ ret = __mem_cgroup_try_charge(NULL,
+ GFP_KERNEL, 1, &memcg, false);
+ if (ret || !memcg)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
mc.precharge++;
@@ -5292,13 +5301,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct task_struct *p)
{
int ret = 0;
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
- if (mem->move_charge_at_immigrate) {
+ if (memcg->move_charge_at_immigrate) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
- VM_BUG_ON(from == mem);
+ VM_BUG_ON(from == memcg);
mm = get_task_mm(p);
if (!mm)
@@ -5313,7 +5322,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
- mc.to = mem;
+ mc.to = memcg;
spin_unlock(&mc.lock);
/* We set mc.moving_task later */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index edc388db730a..06d3479513aa 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -42,6 +42,7 @@
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
diff --git a/mm/memory.c b/mm/memory.c
index a56e3ba816b2..829d43735402 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,7 @@
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
@@ -1503,7 +1503,7 @@ split_fallthrough:
}
if (flags & FOLL_GET)
- get_page(page);
+ get_page_foll(page);
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6e7d8b21dbfa..2168489c0bc9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -11,7 +11,7 @@
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cd237f478304..adc395481813 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -75,7 +75,7 @@
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a3bc3d4d554..e73641b79bb5 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,7 +10,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 33358f878111..578e29174fa6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -13,7 +13,7 @@
*/
#include <linux/migrate.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
diff --git a/mm/mlock.c b/mm/mlock.c
index bd34b3a10852..4f4f53bdc65d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,7 +14,7 @@
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4e0e26591dfa..1ffd97ae26d7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -8,7 +8,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kobject.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
diff --git a/mm/mmap.c b/mm/mmap.c
index 3c0061f744f5..eae90af60ea6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -22,7 +22,7 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 9e82e937000e..cf332bc0080a 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,7 +5,7 @@
#include <linux/mm.h>
#include <linux/mmu_context.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <asm/mmu_context.h>
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088e..9a611d3a1848 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -11,7 +11,7 @@
#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/rcupdate.h>
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..7cf7b7ddc7c5 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,7 +8,6 @@
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/module.h>
struct pglist_data *first_online_pgdat(void)
{
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f2586..7fa41b4a07bf 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -12,7 +12,7 @@
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
#include <linux/memblock.h>
diff --git a/mm/nommu.c b/mm/nommu.c
index 4358032566e9..73419c55eda6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,7 +13,7 @@
* Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e916168b6e0a..471dedb463ab 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,7 +26,7 @@
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 793e9874de51..a3278f005230 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -12,7 +12,7 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -46,26 +46,14 @@
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+#define RATELIMIT_CALC_SHIFT 10
+
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
static long ratelimit_pages = 32;
-/*
- * When balance_dirty_pages decides that the caller needs to perform some
- * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than dirtied pages to ensure that reasonably
- * large amounts of I/O are submitted.
- */
-static inline long sync_writeback_pages(unsigned long dirtied)
-{
- if (dirtied < ratelimit_pages)
- dirtied = ratelimit_pages;
-
- return dirtied + dirtied / 2;
-}
-
/* The following parameters are exported via /proc/sys/vm */
/*
@@ -167,6 +155,8 @@ static void update_completion_period(void)
int shift = calc_period_shift();
prop_change_shift(&vm_completions, shift);
prop_change_shift(&vm_dirties, shift);
+
+ writeback_set_ratelimit();
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -260,50 +250,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
numerator, denominator);
}
-static inline void task_dirties_fraction(struct task_struct *tsk,
- long *numerator, long *denominator)
-{
- prop_fraction_single(&vm_dirties, &tsk->dirties,
- numerator, denominator);
-}
-
-/*
- * task_dirty_limit - scale down dirty throttling threshold for one task
- *
- * task specific dirty limit:
- *
- * dirty -= (dirty/8) * p_{t}
- *
- * To protect light/slow dirtying tasks from heavier/fast ones, we start
- * throttling individual tasks before reaching the bdi dirty limit.
- * Relatively low thresholds will be allocated to heavy dirtiers. So when
- * dirty pages grow large, heavy dirtiers will be throttled first, which will
- * effectively curb the growth of dirty pages. Light dirtiers with high enough
- * dirty threshold may never get throttled.
- */
-#define TASK_LIMIT_FRACTION 8
-static unsigned long task_dirty_limit(struct task_struct *tsk,
- unsigned long bdi_dirty)
-{
- long numerator, denominator;
- unsigned long dirty = bdi_dirty;
- u64 inv = dirty / TASK_LIMIT_FRACTION;
-
- task_dirties_fraction(tsk, &numerator, &denominator);
- inv *= numerator;
- do_div(inv, denominator);
-
- dirty -= inv;
-
- return max(dirty, bdi_dirty/2);
-}
-
-/* Minimum limit for any task */
-static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
-{
- return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
-}
-
/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
@@ -413,6 +359,12 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
+static unsigned long dirty_freerun_ceiling(unsigned long thresh,
+ unsigned long bg_thresh)
+{
+ return (thresh + bg_thresh) / 2;
+}
+
static unsigned long hard_dirty_limit(unsigned long thresh)
{
return max(thresh, global_dirty_limit);
@@ -497,6 +449,198 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
return bdi_dirty;
}
+/*
+ * Dirty position control.
+ *
+ * (o) global/bdi setpoints
+ *
+ * We want the dirty pages to be balanced around the global/bdi setpoints.
+ * When the number of dirty pages is higher/lower than the setpoint, the
+ * dirty position control ratio (and hence task dirty ratelimit) will be
+ * decreased/increased to bring the dirty pages back to the setpoint.
+ *
+ * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
+ *
+ * if (dirty < setpoint) scale up pos_ratio
+ * if (dirty > setpoint) scale down pos_ratio
+ *
+ * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
+ * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+ *
+ * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
+ *
+ * (o) global control line
+ *
+ *     ^ pos_ratio
+ *     |
+ *     |            |<===== global dirty control scope ======>|
+ * 2.0 .............*
+ *     |            .*
+ *     |            . *
+ *     |            .   *
+ *     |            .     *
+ *     |            .        *
+ *     |            .            *
+ * 1.0 ................................*
+ *     |            .                  .     *
+ *     |            .                  .          *
+ *     |            .                  .              *
+ *     |            .                  .                 *
+ *     |            .                  .                    *
+ *   0 +------------.------------------.----------------------*------------->
+ *           freerun^          setpoint^                 limit^   dirty pages
+ *
+ * (o) bdi control line
+ *
+ *     ^ pos_ratio
+ *     |
+ *     |            *
+ *     |              *
+ *     |                *
+ *     |                  *
+ *     |                    * |<=========== span ============>|
+ * 1.0 .......................*
+ *     |                      . *
+ *     |                      .   *
+ *     |                      .     *
+ *     |                      .       *
+ *     |                      .         *
+ *     |                      .           *
+ *     |                      .             *
+ *     |                      .               *
+ *     |                      .                 *
+ *     |                      .                   *
+ *     |                      .                     *
+ * 1/4 ...............................................* * * * * * * * * * * *
+ *     |                      .                         .
+ *     |                      .                           .
+ *     |                      .                             .
+ *   0 +----------------------.-------------------------------.------------->
+ *                bdi_setpoint^                    x_intercept^
+ *
+ * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+ * be smoothly throttled down to normal if it starts high in situations like
+ * - start writing to a slow SD card and a fast disk at the same time. The SD
+ * card's bdi_dirty may rush to many times higher than bdi_setpoint.
+ * - the bdi dirty thresh drops quickly due to change of JBOD workload
+ */
+static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty)
+{
+ unsigned long write_bw = bdi->avg_write_bandwidth;
+ unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
+ unsigned long limit = hard_dirty_limit(thresh);
+ unsigned long x_intercept;
+ unsigned long setpoint; /* dirty pages' target balance point */
+ unsigned long bdi_setpoint;
+ unsigned long span;
+ long long pos_ratio; /* for scaling up/down the rate limit */
+ long x;
+
+ if (unlikely(dirty >= limit))
+ return 0;
+
+ /*
+ * global setpoint
+ *
+ *                           setpoint - dirty 3
+ *        f(dirty) := 1.0 + (----------------)
+ *                           limit - setpoint
+ *
+ * it's a 3rd order polynomial that is subject to
+ *
+ * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
+ * (2) f(setpoint) = 1.0 => the balance point
+ * (3) f(limit) = 0 => the hard limit
+ * (4) df/dx <= 0 => negative feedback control
+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
+ * => fast response on large errors; small oscillation near setpoint
+ */
+ setpoint = (freerun + limit) / 2;
+ x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+ limit - setpoint + 1);
+ pos_ratio = x;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+
+ /*
+ * We have computed basic pos_ratio above based on global situation. If
+ * the bdi is over/under its share of dirty pages, we want to scale
+ * pos_ratio further down/up. That is done by the following mechanism.
+ */
+
+ /*
+ * bdi setpoint
+ *
+ *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+ *
+ *                        x_intercept - bdi_dirty
+ *                     := --------------------------
+ *                        x_intercept - bdi_setpoint
+ *
+ * The main bdi control line is a linear function that is subject to
+ *
+ * (1) f(bdi_setpoint) = 1.0
+ * (2) k = - 1 / (8 * write_bw) (in single bdi case)
+ * or equally: x_intercept = bdi_setpoint + 8 * write_bw
+ *
+ * For single bdi case, the dirty pages are observed to fluctuate
+ * regularly within range
+ * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+ * for various filesystems, where (2) can yield a reasonable 12.5%
+ * fluctuation range for pos_ratio.
+ *
+ * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+ * own size, so move the slope over accordingly and choose a slope that
+ * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+ */
+ if (unlikely(bdi_thresh > thresh))
+ bdi_thresh = thresh;
+ bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+ /*
+ * scale global setpoint to bdi's:
+ * bdi_setpoint = setpoint * bdi_thresh / thresh
+ */
+ x = div_u64((u64)bdi_thresh << 16, thresh + 1);
+ bdi_setpoint = setpoint * (u64)x >> 16;
+ /*
+ * Use span=(8*write_bw) in single bdi case as indicated by
+ * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+ *
+ *        bdi_thresh                    thresh - bdi_thresh
+ * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
+ *          thresh                            thresh
+ */
+ span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
+ x_intercept = bdi_setpoint + span;
+
+ if (bdi_dirty < x_intercept - span / 4) {
+ pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
+ x_intercept - bdi_setpoint + 1);
+ } else
+ pos_ratio /= 4;
+
+ /*
+ * bdi reserve area, safeguard against dirty pool underrun and disk idle
+ * It may push the desired control point of global dirty pages higher
+ * than setpoint.
+ */
+ x_intercept = bdi_thresh / 2;
+ if (bdi_dirty < x_intercept) {
+ if (bdi_dirty > x_intercept / 8)
+ pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+ else
+ pos_ratio *= 8;
+ }
+
+ return pos_ratio;
+}
+
static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
unsigned long elapsed,
unsigned long written)
@@ -593,8 +737,153 @@ static void global_update_bandwidth(unsigned long thresh,
spin_unlock(&dirty_lock);
}
+/*
+ * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ *
+ * Normal bdi tasks will be curbed at or below it in long term.
+ * Obviously it should be around (write_bw / N) when there are N dd tasks.
+ */
+static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
+ unsigned long limit = hard_dirty_limit(thresh);
+ unsigned long setpoint = (freerun + limit) / 2;
+ unsigned long write_bw = bdi->avg_write_bandwidth;
+ unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+ unsigned long dirty_rate;
+ unsigned long task_ratelimit;
+ unsigned long balanced_dirty_ratelimit;
+ unsigned long pos_ratio;
+ unsigned long step;
+ unsigned long x;
+
+ /*
+ * The dirty rate will match the writeout rate in long term, except
+ * when dirty pages are truncated by userspace or re-dirtied by FS.
+ */
+ dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+
+ pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty);
+ /*
+ * task_ratelimit reflects each dd's dirty rate for the past 200ms.
+ */
+ task_ratelimit = (u64)dirty_ratelimit *
+ pos_ratio >> RATELIMIT_CALC_SHIFT;
+ task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
+
+ /*
+ * A linear estimation of the "balanced" throttle rate. The theory is,
+ * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+ * dirty_rate will be measured to be (N * task_ratelimit). So the below
+ * formula will yield the balanced rate limit (write_bw / N).
+ *
+ * Note that the expanded form is not a pure rate feedback:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
+ * but also takes pos_ratio into account:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
+ *
+ * (1) is not realistic because pos_ratio also takes part in balancing
+ * the dirty rate. Consider the state
+ * pos_ratio = 0.5 (3)
+ * rate = 2 * (write_bw / N) (4)
+ * If (1) is used, it will get stuck in that state! Because each dd will
+ * be throttled at
+ * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
+ * yielding
+ * dirty_rate = N * task_ratelimit = write_bw (6)
+ * put (6) into (1) we get
+ * rate_(i+1) = rate_(i) (7)
+ *
+ * So we end up using (2) to always keep
+ * rate_(i+1) ~= (write_bw / N) (8)
+ * regardless of the value of pos_ratio. As long as (8) is satisfied,
+ * pos_ratio is able to drive itself to 1.0, which is not only where
+ * the dirty count meets the setpoint, but also where the slope of
+ * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
+ */
+ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
+ dirty_rate | 1);
+
+ /*
+ * We could safely do this and return immediately:
+ *
+ * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+ *
+ * However to get a more stable dirty_ratelimit, the below elaborated
+ * code makes use of task_ratelimit to filter out singular points and
+ * limit the step size.
+ *
+ * The below code essentially only uses the relative value of
+ *
+ * task_ratelimit - dirty_ratelimit
+ * = (pos_ratio - 1) * dirty_ratelimit
+ *
+ * which reflects the direction and size of dirty position error.
+ */
+
+ /*
+ * dirty_ratelimit will follow balanced_dirty_ratelimit iff
+ * task_ratelimit is on the same side of dirty_ratelimit, too.
+ * For example, when
+ * - dirty_ratelimit > balanced_dirty_ratelimit
+ * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
+ * lowering dirty_ratelimit will help meet both the position and rate
+ * control targets. Otherwise, don't update dirty_ratelimit if it will
+ * only help meet the rate target. After all, what the users ultimately
+ * feel and care about are a stable dirty rate and a small position error.
+ *
+ * |task_ratelimit - dirty_ratelimit| is used to limit the step size
+ * and filter out the singular points of balanced_dirty_ratelimit, which
+ * keeps jumping around randomly and can even leap far away at times
+ * due to the small 200ms estimation period of dirty_rate (we want to
+ * keep that period small to reduce time lags).
+ */
+ step = 0;
+ if (dirty < setpoint) {
+ x = min(bdi->balanced_dirty_ratelimit,
+ min(balanced_dirty_ratelimit, task_ratelimit));
+ if (dirty_ratelimit < x)
+ step = x - dirty_ratelimit;
+ } else {
+ x = max(bdi->balanced_dirty_ratelimit,
+ max(balanced_dirty_ratelimit, task_ratelimit));
+ if (dirty_ratelimit > x)
+ step = dirty_ratelimit - x;
+ }
+
+ /*
+ * Don't pursue 100% rate matching. It's impossible since the balanced
+ * rate itself is constantly fluctuating. So decrease the track speed
+ * when it gets close to the target. Helps eliminate pointless tremors.
+ */
+ step >>= dirty_ratelimit / (2 * step + 1);
+ /*
+ * Limit the tracking speed to avoid overshooting.
+ */
+ step = (step + 7) / 8;
+
+ if (dirty_ratelimit < balanced_dirty_ratelimit)
+ dirty_ratelimit += step;
+ else
+ dirty_ratelimit -= step;
+
+ bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+
+ trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+}
+
void __bdi_update_bandwidth(struct backing_dev_info *bdi,
unsigned long thresh,
+ unsigned long bg_thresh,
unsigned long dirty,
unsigned long bdi_thresh,
unsigned long bdi_dirty,
@@ -602,6 +891,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
{
unsigned long now = jiffies;
unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long dirtied;
unsigned long written;
/*
@@ -610,6 +900,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
if (elapsed < BANDWIDTH_INTERVAL)
return;
+ dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
/*
@@ -619,18 +910,23 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
goto snapshot;
- if (thresh)
+ if (thresh) {
global_update_bandwidth(thresh, dirty, now);
-
+ bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty,
+ dirtied, elapsed);
+ }
bdi_update_write_bandwidth(bdi, elapsed, written);
snapshot:
+ bdi->dirtied_stamp = dirtied;
bdi->written_stamp = written;
bdi->bw_time_stamp = now;
}
static void bdi_update_bandwidth(struct backing_dev_info *bdi,
unsigned long thresh,
+ unsigned long bg_thresh,
unsigned long dirty,
unsigned long bdi_thresh,
unsigned long bdi_dirty,
@@ -639,37 +935,99 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
return;
spin_lock(&bdi->wb.list_lock);
- __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
- start_time);
+ __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty, start_time);
spin_unlock(&bdi->wb.list_lock);
}
/*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+ unsigned long thresh)
+{
+ if (thresh > dirty)
+ return 1UL << (ilog2(thresh - dirty) >> 1);
+
+ return 1;
+}
+
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
+{
+ unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long hi = ilog2(bw);
+ unsigned long lo = ilog2(bdi->dirty_ratelimit);
+ unsigned long t;
+
+ /* target for 20ms max pause on 1-dd case */
+ t = HZ / 50;
+
+ /*
+ * Scale up pause time for concurrent dirtiers in order to reduce CPU
+ * overheads.
+ *
+ * (N * 20ms) on 2^N concurrent tasks.
+ */
+ if (hi > lo)
+ t += (hi - lo) * (20 * HZ) / 1024;
+
+ /*
+ * Limit pause time for small memory systems. If we sleep for too long,
+ * a small pool of dirty/writeback pages may go empty and the disk may go
+ * idle.
+ *
+ * 8 serves as the safety ratio.
+ */
+ if (bdi_dirty)
+ t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+
+ /*
+ * The pause time will be settled within range (max_pause/4, max_pause).
+ * Apply a minimal value of 4 to get a non-zero max_pause/4.
+ */
+ return clamp_val(t, 4, MAX_PAUSE);
+}
+
+/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
- * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
static void balance_dirty_pages(struct address_space *mapping,
- unsigned long write_chunk)
+ unsigned long pages_dirtied)
{
- unsigned long nr_reclaimable, bdi_nr_reclaimable;
+ unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long bdi_reclaimable;
unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
unsigned long bdi_dirty;
+ unsigned long freerun;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long task_bdi_thresh;
- unsigned long min_task_bdi_thresh;
- unsigned long pages_written = 0;
- unsigned long pause = 1;
+ long pause = 0;
+ long uninitialized_var(max_pause);
bool dirty_exceeded = false;
- bool clear_dirty_exceeded = true;
+ unsigned long task_ratelimit;
+ unsigned long uninitialized_var(dirty_ratelimit);
+ unsigned long pos_ratio;
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long start_time = jiffies;
for (;;) {
+ /*
+ * Unstable writes are a feature of certain networked
+ * filesystems (i.e. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ */
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
@@ -681,12 +1039,28 @@ static void balance_dirty_pages(struct address_space *mapping,
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
+ freerun = dirty_freerun_ceiling(dirty_thresh,
+ background_thresh);
+ if (nr_dirty <= freerun)
break;
+ if (unlikely(!writeback_in_progress(bdi)))
+ bdi_start_background_writeback(bdi);
+
+ /*
+ * bdi_thresh is not treated as a limiting factor the way
+ * dirty_thresh is, for these reasons:
+ * - in JBOD setup, bdi_thresh can fluctuate a lot
+ * - in a system with HDD and USB key, the USB key may somehow
+ * go into state (bdi_dirty >> bdi_thresh) either because
+ * bdi_dirty starts high, or because bdi_thresh drops low.
+ * In this case we don't want to hard throttle the USB key
+ * dirtiers for 100 seconds until bdi_dirty drops under
+ * bdi_thresh. Instead the auxiliary bdi control line in
+ * bdi_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+ */
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
- min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
- task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -698,56 +1072,69 @@ static void balance_dirty_pages(struct address_space *mapping,
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
- bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- bdi_dirty = bdi_nr_reclaimable +
+ if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
+ bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ bdi_dirty = bdi_reclaimable +
bdi_stat_sum(bdi, BDI_WRITEBACK);
} else {
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_dirty = bdi_nr_reclaimable +
+ bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_dirty = bdi_reclaimable +
bdi_stat(bdi, BDI_WRITEBACK);
}
- /*
- * The bdi thresh is somehow "soft" limit derived from the
- * global "hard" limit. The former helps to prevent heavy IO
- * bdi or process from holding back light ones; The latter is
- * the last resort safeguard.
- */
- dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+ dirty_exceeded = (bdi_dirty > bdi_thresh) ||
(nr_dirty > dirty_thresh);
- clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
- (nr_dirty <= dirty_thresh);
-
- if (!dirty_exceeded)
- break;
-
- if (!bdi->dirty_exceeded)
+ if (dirty_exceeded && !bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
- bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
- bdi_thresh, bdi_dirty, start_time);
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
- */
- trace_balance_dirty_start(bdi);
- if (bdi_nr_reclaimable > task_bdi_thresh) {
- pages_written += writeback_inodes_wb(&bdi->wb,
- write_chunk);
- trace_balance_dirty_written(bdi, pages_written);
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
+ nr_dirty, bdi_thresh, bdi_dirty,
+ start_time);
+
+ max_pause = bdi_max_pause(bdi, bdi_dirty);
+
+ dirty_ratelimit = bdi->dirty_ratelimit;
+ pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
+ background_thresh, nr_dirty,
+ bdi_thresh, bdi_dirty);
+ task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+ RATELIMIT_CALC_SHIFT;
+ if (unlikely(task_ratelimit == 0)) {
+ pause = max_pause;
+ goto pause;
+ }
+ pause = HZ * pages_dirtied / task_ratelimit;
+ if (unlikely(pause <= 0)) {
+ trace_balance_dirty_pages(bdi,
+ dirty_thresh,
+ background_thresh,
+ nr_dirty,
+ bdi_thresh,
+ bdi_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ pause,
+ start_time);
+ pause = 1; /* avoid resetting nr_dirtied_pause below */
+ break;
}
+ pause = min(pause, max_pause);
+
+pause:
+ trace_balance_dirty_pages(bdi,
+ dirty_thresh,
+ background_thresh,
+ nr_dirty,
+ bdi_thresh,
+ bdi_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ pause,
+ start_time);
__set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
- trace_balance_dirty_wait(bdi);
dirty_thresh = hard_dirty_limit(dirty_thresh);
/*
@@ -756,24 +1143,30 @@ static void balance_dirty_pages(struct address_space *mapping,
* 200ms is typically more than enough to curb heavy dirtiers;
* (b) the pause time limit makes the dirtiers more responsive.
*/
- if (nr_dirty < dirty_thresh &&
- bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
- time_after(jiffies, start_time + MAX_PAUSE))
+ if (nr_dirty < dirty_thresh)
break;
-
- /*
- * Increase the delay for each loop, up to our previous
- * default of taking a 100ms nap.
- */
- pause <<= 1;
- if (pause > HZ / 10)
- pause = HZ / 10;
}
- /* Clear dirty_exceeded flag only when no task can exceed the limit */
- if (clear_dirty_exceeded && bdi->dirty_exceeded)
+ if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
+ current->nr_dirtied = 0;
+ if (pause == 0) { /* in freerun area */
+ current->nr_dirtied_pause =
+ dirty_poll_interval(nr_dirty, dirty_thresh);
+ } else if (pause <= max_pause / 4 &&
+ pages_dirtied >= current->nr_dirtied_pause) {
+ current->nr_dirtied_pause = clamp_val(
+ dirty_ratelimit * (max_pause / 2) / HZ,
+ pages_dirtied + pages_dirtied / 8,
+ pages_dirtied * 4);
+ } else if (pause >= max_pause) {
+ current->nr_dirtied_pause = 1 | clamp_val(
+ dirty_ratelimit * (max_pause / 2) / HZ,
+ pages_dirtied / 4,
+ pages_dirtied - pages_dirtied / 8);
+ }
+
if (writeback_in_progress(bdi))
return;
@@ -785,8 +1178,10 @@ static void balance_dirty_pages(struct address_space *mapping,
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
- if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
+ if (laptop_mode)
+ return;
+
+ if (nr_reclaimable > background_thresh)
bdi_start_background_writeback(bdi);
}
@@ -800,7 +1195,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
-static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+static DEFINE_PER_CPU(int, bdp_ratelimits);
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -820,31 +1215,39 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
- unsigned long ratelimit;
- unsigned long *p;
+ int ratelimit;
+ int *p;
if (!bdi_cap_account_dirty(bdi))
return;
- ratelimit = ratelimit_pages;
- if (mapping->backing_dev_info->dirty_exceeded)
- ratelimit = 8;
+ ratelimit = current->nr_dirtied_pause;
+ if (bdi->dirty_exceeded)
+ ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
+
+ current->nr_dirtied += nr_pages_dirtied;
+ preempt_disable();
/*
- * Check the rate limiting. Also, we do not want to throttle real-time
- * tasks in balance_dirty_pages(). Period.
+ * This prevents one CPU from accumulating too many dirtied pages without
+ * calling into balance_dirty_pages(), which can happen when there are
+ * 1000+ tasks that all start dirtying pages at exactly the same
+ * time and hence all honour a too-large initial task->nr_dirtied_pause.
*/
- preempt_disable();
p = &__get_cpu_var(bdp_ratelimits);
- *p += nr_pages_dirtied;
- if (unlikely(*p >= ratelimit)) {
- ratelimit = sync_writeback_pages(*p);
+ if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
- preempt_enable();
- balance_dirty_pages(mapping, ratelimit);
- return;
+ else {
+ *p += nr_pages_dirtied;
+ if (unlikely(*p >= ratelimit_pages)) {
+ *p = 0;
+ ratelimit = 0;
+ }
}
preempt_enable();
+
+ if (unlikely(current->nr_dirtied >= ratelimit))
+ balance_dirty_pages(mapping, current->nr_dirtied);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
@@ -900,7 +1303,8 @@ void laptop_mode_timer_fn(unsigned long data)
* threshold
*/
if (bdi_has_dirty_io(&q->backing_dev_info))
- bdi_start_writeback(&q->backing_dev_info, nr_pages);
+ bdi_start_writeback(&q->backing_dev_info, nr_pages,
+ WB_REASON_LAPTOP_TIMER);
}
/*
@@ -939,22 +1343,17 @@ void laptop_sync_completion(void)
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high. Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time. So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
+ * thresholds.
*/
void writeback_set_ratelimit(void)
{
- ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
- if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
- ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
}
static int __cpuinit
@@ -1324,6 +1723,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
task_dirty_inc(current);
task_io_account_write(PAGE_CACHE_SIZE);
}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6bdc67dbbc28..2d123f94a8df 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
void *addr = NULL;
+ gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
- addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
- if (addr)
+ addr = alloc_pages_exact_nid(nid, size, flags);
+ if (addr) {
+ kmemleak_alloc(addr, size, 1, flags);
return addr;
+ }
if (node_state(nid, N_HIGH_MEMORY))
addr = vmalloc_node(size, nid);
@@ -357,7 +360,7 @@ struct swap_cgroup_ctrl {
spinlock_t lock;
};
-struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
struct swap_cgroup {
unsigned short id;
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 2876349339a7..942212970529 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -17,7 +17,6 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/module.h>
#include <linux/quicklist.h>
DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
diff --git a/mm/readahead.c b/mm/readahead.c
index 867f9dd82dcd..cbcbb02f3e28 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -11,7 +11,7 @@
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 6541cf7fd1d3..a4fd3680038b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -51,7 +51,7 @@
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index fa4fa6ce13bc..d6722506d2da 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,7 +28,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
static struct vfsmount *shm_mnt;
@@ -2503,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
d_instantiate(path.dentry, inode);
inode->i_size = size;
- inode->i_nlink = 0; /* It is unlinked */
+ clear_nlink(inode); /* It is unlinked */
#ifndef CONFIG_MMU
error = ramfs_nommu_expand_for_mapping(inode, size);
if (error)
diff --git a/mm/slob.c b/mm/slob.c
index bf3918187165..8105be42cad1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -63,7 +63,7 @@
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/kmemleak.h>
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 64b984091edb..1b7e22ab9b09 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -21,7 +21,6 @@
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 858e1dff9b2a..61d7cde23111 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -6,7 +6,7 @@
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include "internal.h"
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b3..a91caf754d9b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -21,7 +21,7 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/percpu_counter.h>
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
{
if (unlikely(PageTail(page))) {
/* __split_huge_page_refcount can run under us */
- struct page *page_head = page->first_page;
- smp_rmb();
- /*
- * If PageTail is still set after smp_rmb() we can be sure
- * that the page->first_page we read wasn't a dangling pointer.
- * See __split_huge_page_refcount() smp_wmb().
- */
- if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+ struct page *page_head = compound_trans_head(page);
+
+ if (likely(page != page_head &&
+ get_page_unless_zero(page_head))) {
unsigned long flags;
/*
- * Verify that our page_head wasn't converted
- * to a a regular page before we got a
- * reference on it.
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
*/
- if (unlikely(!PageHead(page_head))) {
- /* PageHead is cleared after PageTail */
- smp_rmb();
- VM_BUG_ON(PageTail(page));
- goto out_put_head;
- }
- /*
- * Only run compound_lock on a valid PageHead,
- * after having it pinned with
- * get_page_unless_zero() above.
- */
- smp_mb();
- /* page_head wasn't a dangling pointer */
flags = compound_lock_irqsave(page_head);
if (unlikely(!PageTail(page))) {
/* __split_huge_page_refcount run before us */
compound_unlock_irqrestore(page_head, flags);
VM_BUG_ON(PageHead(page_head));
- out_put_head:
if (put_page_testzero(page_head))
__put_single_page(page_head);
out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
VM_BUG_ON(page_head != page->first_page);
/*
* We can release the refcount taken by
- * get_page_unless_zero now that
- * split_huge_page_refcount is blocked on the
- * compound_lock.
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on
+ * the compound_lock.
*/
if (put_page_testzero(page_head))
VM_BUG_ON(1);
/* __split_huge_page_refcount will wait now */
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
- atomic_dec(&page->_count);
+ VM_BUG_ON(page_mapcount(page) <= 0);
+ atomic_dec(&page->_mapcount);
VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
compound_unlock_irqrestore(page_head, flags);
if (put_page_testzero(page_head)) {
if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
}
EXPORT_SYMBOL(put_page);
+/*
+ * This function is exported but must not be called by anything other
+ * than get_page(). It implements the slow path of get_page().
+ *
+ * @page: the page get_page() was called on.
+ *
+ * The head page is looked up with compound_trans_head() and pinned
+ * with get_page_unless_zero(). The compound lock of the head is then
+ * taken (with interrupts saved/disabled) and, only if @page is still
+ * a tail page at that point, __get_page_tail_foll() is called on it.
+ *
+ * Returns true when __get_page_tail_foll() was run on @page. In that
+ * case the pin obtained by get_page_unless_zero() is not released
+ * here.
+ *
+ * Returns false when compound_trans_head() handed back @page itself,
+ * when get_page_unless_zero() failed on the head, or when @page was
+ * no longer a tail page once the compound lock was held. Only in the
+ * last case had a pin been taken on the head; it is dropped again
+ * with put_page() after the lock is released.
+ *
+ * NOTE(review): the "false" passed to __get_page_tail_foll()
+ * presumably tells it not to take a further reference on the head,
+ * since one was already taken above -- confirm against the helper's
+ * definition.
+ */
+bool __get_page_tail(struct page *page)
+{
+ /*
+ * This takes care of get_page() if run on a tail page
+ * returned by one of the get_user_pages/follow_page variants.
+ * get_user_pages/follow_page itself doesn't need the compound
+ * lock because it runs __get_page_tail_foll() under the
+ * proper PT lock that already serializes against
+ * split_huge_page().
+ */
+ unsigned long flags;
+ bool got = false;
+ struct page *page_head = compound_trans_head(page);
+
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ /* here __split_huge_page_refcount won't run anymore */
+ if (likely(PageTail(page))) {
+ __get_page_tail_foll(page, false);
+ got = true;
+ }
+ compound_unlock_irqrestore(page_head, flags);
+ if (unlikely(!got))
+ put_page(page_head);
+ }
+ return got;
+}
+EXPORT_SYMBOL(__get_page_tail);
+
/**
* put_pages_list() - release a list of pages
* @pages: list of pages threaded on page->lru
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 46680461785b..78cc4d1f6cce 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -6,7 +6,6 @@
*
* Rewritten to use page cache, (C) 1998 Stephen Tweedie
*/
-#include <linux/module.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c9d654009125..b1cd12060723 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,7 +21,6 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
diff --git a/mm/truncate.c b/mm/truncate.c
index b40ac6d4e86e..632b15e29f74 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,7 +12,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
diff --git a/mm/util.c b/mm/util.c
index 88ea1bd661c0..136ac4f322b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a90c603a8d02..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1767,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
- low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+ low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
return low;
}
#else
@@ -1810,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
if (scanning_global_lru(sc))
low = inactive_file_is_low_global(zone);
else
- low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+ low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
return low;
}
@@ -2266,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
- wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+ WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}