summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorMichal Marek <mmarek@suse.cz>2011-03-09 18:15:44 +0300
committerMichal Marek <mmarek@suse.cz>2011-03-09 18:15:44 +0300
commit2d8ad8719591fa803b0d589ed057fa46f49b7155 (patch)
tree4ae051577dad1161c91dafbf4207bb10a9dc91bb /mm
parent9b4ce7bce5f30712fd926ab4599a803314a07719 (diff)
parentc56eb8fb6dccb83d9fe62fd4dc00c834de9bc470 (diff)
downloadlinux-2d8ad8719591fa803b0d589ed057fa46f49b7155.tar.xz
Merge commit 'v2.6.38-rc1' into kbuild/packaging
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig74
-rw-r--r--mm/Makefile9
-rw-r--r--mm/backing-dev.c577
-rw-r--r--mm/bootmem.c227
-rw-r--r--mm/bounce.c3
-rw-r--r--mm/compaction.c695
-rw-r--r--mm/dmapool.c18
-rw-r--r--mm/fadvise.c10
-rw-r--r--mm/failslab.c19
-rw-r--r--mm/filemap.c166
-rw-r--r--mm/filemap_xip.c3
-rw-r--r--mm/fremap.c9
-rw-r--r--mm/highmem.c63
-rw-r--r--mm/huge_memory.c2346
-rw-r--r--mm/hugetlb.c392
-rw-r--r--mm/hwpoison-inject.c15
-rw-r--r--mm/init-mm.c6
-rw-r--r--mm/internal.h9
-rw-r--r--mm/kmemleak.c101
-rw-r--r--mm/ksm.c190
-rw-r--r--mm/maccess.c2
-rw-r--r--mm/madvise.c10
-rw-r--r--mm/memblock.c842
-rw-r--r--mm/memcontrol.c2612
-rw-r--r--mm/memory-failure.c347
-rw-r--r--mm/memory.c680
-rw-r--r--mm/memory_hotplug.c156
-rw-r--r--mm/mempolicy.c506
-rw-r--r--mm/migrate.c420
-rw-r--r--mm/mincore.c272
-rw-r--r--mm/mlock.c219
-rw-r--r--mm/mmap.c360
-rw-r--r--mm/mmu_context.c4
-rw-r--r--mm/mmu_notifier.c21
-rw-r--r--mm/mprotect.c23
-rw-r--r--mm/mremap.c23
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c163
-rw-r--r--mm/oom_kill.c735
-rw-r--r--mm/page-writeback.c360
-rw-r--r--mm/page_alloc.c989
-rw-r--r--mm/page_cgroup.c49
-rw-r--r--mm/page_io.c3
-rw-r--r--mm/page_isolation.c3
-rw-r--r--mm/pagewalk.c53
-rw-r--r--mm/percpu-km.c108
-rw-r--r--mm/percpu-vm.c451
-rw-r--r--mm/percpu.c1139
-rw-r--r--mm/pgtable-generic.c120
-rw-r--r--mm/quicklist.c1
-rw-r--r--mm/readahead.c9
-rw-r--r--mm/rmap.c530
-rw-r--r--mm/shmem.c300
-rw-r--r--mm/slab.c366
-rw-r--r--mm/slob.c31
-rw-r--r--mm/slub.c1260
-rw-r--r--mm/sparse-vmemmap.c68
-rw-r--r--mm/sparse.c208
-rw-r--r--mm/swap.c136
-rw-r--r--mm/swap_state.c7
-rw-r--r--mm/swapfile.c160
-rw-r--r--mm/truncate.c43
-rw-r--r--mm/util.c24
-rw-r--r--mm/vmalloc.c300
-rw-r--r--mm/vmscan.c1392
-rw-r--r--mm/vmstat.c529
66 files changed, 15718 insertions, 5250 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 17b8947aa7da..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
config SPARSEMEM_VMEMMAP_ENABLE
bool
+config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ def_bool y
+ depends on SPARSEMEM && X86_64
+
config SPARSEMEM_VMEMMAP
bool "Sparse Memory virtual memmap"
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -124,6 +128,9 @@ config SPARSEMEM_VMEMMAP
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
+config HAVE_MEMBLOCK
+ boolean
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
@@ -168,17 +175,28 @@ config SPLIT_PTLOCK_CPUS
default "4"
#
+# support for memory compaction
+config COMPACTION
+ bool "Allow for memory compaction"
+ select MIGRATION
+ depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+ help
+ Allows the compaction of memory for the allocation of huge pages.
+
+#
# support for page migration
#
config MIGRATION
bool "Page migration"
def_bool y
- depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
+ depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION
help
Allows the migration of the physical location of pages of processes
- while the virtual addresses are not changed. This is useful for
- example on NUMA systems to put pages nearer to the processors accessing
- the page.
+ while the virtual addresses are not changed. This is useful in
+ two situations. The first is on NUMA systems to put pages nearer
+ to the processors accessing them. The second is when allocating huge
+ pages as migration can relocate pages to satisfy a huge page
+ allocation instead of reclaiming.
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
@@ -195,7 +213,7 @@ config BOUNCE
config NR_QUICK
int
depends on QUICKLIST
- default "2" if SUPERH || AVR32
+ default "2" if AVR32
default "1"
config VIRT_TO_BUS
@@ -283,3 +301,49 @@ config NOMMU_INITIAL_TRIM_EXCESS
of 1 says that all excess pages should be trimmed.
See Documentation/nommu-mmap.txt for more information.
+
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on X86 && MMU
+ select COMPACTION
+ help
+ Transparent Hugepages allows the kernel to use huge pages and
+ huge tlb transparently to the applications whenever possible.
+ This feature can improve computing performance to certain
+ applications by speeding up page faults during memory
+ allocation, by reducing the number of tlb misses and by speeding
+ up the pagetable walking.
+
+ On memory-constrained embedded systems, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+ Setting Transparent Hugepage to "always" can increase the
+ memory footprint of applications without a guaranteed
+ benefit, but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+ Setting Transparent Hugepage to "madvise" will only provide a
+ performance benefit to the applications using
+ madvise(MADV_HUGEPAGE), but it won't risk increasing the
+ memory footprint of applications without a guaranteed
+ benefit.
+endchoice
+
+#
+# UP and nommu archs use km based percpu allocator
+#
+config NEED_PER_CPU_KM
+ depends on !SMP
+ bool
+ default y
diff --git a/mm/Makefile b/mm/Makefile
index 7a68d2ab5560..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,16 +5,18 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o
+ vmalloc.o pagewalk.o pgtable-generic.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o mmu_context.o \
+ page_isolation.o mm_init.o mmu_context.o percpu.o \
$(mmu-y)
obj-y += init-mm.o
+obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -23,6 +25,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_COMPACTION) += compaction.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
@@ -33,8 +36,8 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_SMP) += percpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca0347707..027100d30227 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,9 @@
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
+#include <trace/events/writeback.h>
+
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
@@ -25,6 +28,12 @@ struct backing_dev_info default_backing_dev_info = {
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
+struct backing_dev_info noop_backing_dev_info = {
+ .name = "noop",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
static struct class *bdi_class;
/*
@@ -41,9 +50,6 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
-static void arm_supers_timer(void);
-
-static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -59,31 +65,25 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- struct bdi_writeback *wb;
+ struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
struct inode *inode;
- /*
- * inode lock is enough here, the bdi->wb_list is protected by
- * RCU on the reader side
- */
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(wb, &bdi->wb_list, list) {
- nr_wb++;
- list_for_each_entry(inode, &wb->b_dirty, i_list)
- nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
- nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
- nr_more_io++;
- }
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
+ nr_more_io++;
spin_unlock(&inode_lock);
- get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
@@ -92,21 +92,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
"BackgroundThresh: %8lu kB\n"
- "WritebackThreads: %8lu\n"
"b_dirty: %8lu\n"
"b_io: %8lu\n"
"b_more_io: %8lu\n"
"bdi_list: %8u\n"
- "state: %8lx\n"
- "wb_mask: %8lx\n"
- "wb_list: %8u\n"
- "wb_cnt: %8u\n",
+ "state: %8lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
- !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
- !list_empty(&bdi->wb_list), bdi->wb_cnt);
+ K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state);
#undef K
return 0;
@@ -227,6 +222,9 @@ static struct device_attribute bdi_dev_attrs[] = {
static __init int bdi_class_init(void)
{
bdi_class = class_create(THIS_MODULE, "bdi");
+ if (IS_ERR(bdi_class))
+ return PTR_ERR(bdi_class);
+
bdi_class->dev_attrs = bdi_dev_attrs;
bdi_debug_init();
return 0;
@@ -240,89 +238,18 @@ static int __init default_bdi_init(void)
sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
BUG_ON(IS_ERR(sync_supers_tsk));
- init_timer(&sync_supers_timer);
setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
- arm_supers_timer();
+ bdi_arm_supers_timer();
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
+ err = bdi_init(&noop_backing_dev_info);
return err;
}
subsys_initcall(default_bdi_init);
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
-{
- memset(wb, 0, sizeof(*wb));
-
- wb->bdi = bdi;
- wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
-}
-
-static void bdi_task_init(struct backing_dev_info *bdi,
- struct bdi_writeback *wb)
-{
- struct task_struct *tsk = current;
-
- spin_lock(&bdi->wb_lock);
- list_add_tail_rcu(&wb->list, &bdi->wb_list);
- spin_unlock(&bdi->wb_lock);
-
- tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(tsk, 0);
-}
-
-static int bdi_start_fn(void *ptr)
-{
- struct bdi_writeback *wb = ptr;
- struct backing_dev_info *bdi = wb->bdi;
- int ret;
-
- /*
- * Add us to the active bdi_list
- */
- spin_lock_bh(&bdi_lock);
- list_add_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_task_init(bdi, wb);
-
- /*
- * Clear pending bit and wakeup anybody waiting to tear us down
- */
- clear_bit(BDI_pending, &bdi->state);
- smp_mb__after_clear_bit();
- wake_up_bit(&bdi->state, BDI_pending);
-
- ret = bdi_writeback_task(wb);
-
- /*
- * Remove us from the list
- */
- spin_lock(&bdi->wb_lock);
- list_del_rcu(&wb->list);
- spin_unlock(&bdi->wb_lock);
-
- /*
- * Flush any work that raced with us exiting. No new work
- * will be added, since this bdi isn't discoverable anymore.
- */
- if (!list_empty(&bdi->work_list))
- wb_do_writeback(wb, 1);
-
- wb->task = NULL;
- return ret;
-}
-
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
return wb_has_dirty_io(&bdi->wb);
@@ -331,21 +258,20 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
static void bdi_flush_io(struct backing_dev_info *bdi)
{
struct writeback_control wbc = {
- .bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.range_cyclic = 1,
.nr_to_write = 1024,
};
- writeback_inodes_wbc(&wbc);
+ writeback_inodes_wb(&bdi->wb, &wbc);
}
/*
- * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
* to implement sync_supers_bdi() or similar and simply do it from the
- * bdi writeback tasks individually.
+ * bdi writeback thread individually.
*/
static int bdi_sync_supers(void *unused)
{
@@ -364,10 +290,13 @@ static int bdi_sync_supers(void *unused)
return 0;
}
-static void arm_supers_timer(void)
+void bdi_arm_supers_timer(void)
{
unsigned long next;
+ if (!dirty_writeback_interval)
+ return;
+
next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
mod_timer(&sync_supers_timer, round_jiffies_up(next));
}
@@ -375,142 +304,202 @@ static void arm_supers_timer(void)
static void sync_supers_timer_fn(unsigned long unused)
{
wake_up_process(sync_supers_tsk);
- arm_supers_timer();
+ bdi_arm_supers_timer();
}
-static int bdi_forker_task(void *ptr)
+static void wakeup_timer_fn(unsigned long data)
{
- struct bdi_writeback *me = ptr;
-
- bdi_task_init(me->bdi, me);
-
- for (;;) {
- struct backing_dev_info *bdi, *tmp;
- struct bdi_writeback *wb;
+ struct backing_dev_info *bdi = (struct backing_dev_info *)data;
+ spin_lock_bh(&bdi->wb_lock);
+ if (bdi->wb.task) {
+ trace_writeback_wake_thread(bdi);
+ wake_up_process(bdi->wb.task);
+ } else {
/*
- * Temporary measure, we want to make sure we don't see
- * dirty data on the default backing_dev_info
+ * When bdi tasks are inactive for a long time, they are killed.
+ * In this case we have to wake-up the forker thread which
+ * should create and run the bdi thread.
*/
- if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
- wb_do_writeback(me, 0);
+ trace_writeback_wake_forker_thread(bdi);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+ spin_unlock_bh(&bdi->wb_lock);
+}
- spin_lock_bh(&bdi_lock);
+/*
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
+ * by delaying the wake-up.
+ */
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+{
+ unsigned long timeout;
- /*
- * Check if any existing bdi's have dirty data without
- * a thread registered. If so, set that up.
- */
- list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
- if (bdi->wb.task)
- continue;
- if (list_empty(&bdi->work_list) &&
- !bdi_has_dirty_io(bdi))
- continue;
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+}
- bdi_add_default_flusher_task(bdi);
- }
+/*
+ * Calculate the longest interval (jiffies) bdi threads are allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive(void)
+{
+ unsigned long interval;
- set_current_state(TASK_INTERRUPTIBLE);
+ interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+ return max(5UL * 60 * HZ, interval);
+}
- if (list_empty(&bdi_pending_list)) {
- unsigned long wait;
+static int bdi_forker_thread(void *ptr)
+{
+ struct bdi_writeback *me = ptr;
- spin_unlock_bh(&bdi_lock);
- wait = msecs_to_jiffies(dirty_writeback_interval * 10);
- schedule_timeout(wait);
- try_to_freeze();
- continue;
- }
+ current->flags |= PF_SWAPWRITE;
+ set_freezable();
- __set_current_state(TASK_RUNNING);
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(current, 0);
- /*
- * This is our real job - check for pending entries in
- * bdi_pending_list, and create the tasks that got added
- */
- bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
- bdi_list);
- list_del_init(&bdi->bdi_list);
- spin_unlock_bh(&bdi_lock);
+ for (;;) {
+ struct task_struct *task = NULL;
+ struct backing_dev_info *bdi;
+ enum {
+ NO_ACTION, /* Nothing to do */
+ FORK_THREAD, /* Fork bdi thread */
+ KILL_THREAD, /* Kill inactive bdi thread */
+ } action = NO_ACTION;
- wb = &bdi->wb;
- wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
- dev_name(bdi->dev));
/*
- * If task creation fails, then readd the bdi to
- * the pending list and force writeout of the bdi
- * from this forker thread. That will free some memory
- * and we can try again.
+ * Temporary measure, we want to make sure we don't see
+ * dirty data on the default backing_dev_info
*/
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
-
- /*
- * Add this 'bdi' to the back, so we get
- * a chance to flush other bdi's to free
- * memory.
- */
- spin_lock_bh(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_flush_io(bdi);
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
+ del_timer(&me->wakeup_timer);
+ wb_do_writeback(me, 0);
}
- }
- return 0;
-}
+ spin_lock_bh(&bdi_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
-static void bdi_add_to_pending(struct rcu_head *head)
-{
- struct backing_dev_info *bdi;
+ list_for_each_entry(bdi, &bdi_list, bdi_list) {
+ bool have_dirty_io;
- bdi = container_of(head, struct backing_dev_info, rcu_head);
- INIT_LIST_HEAD(&bdi->bdi_list);
+ if (!bdi_cap_writeback_dirty(bdi) ||
+ bdi_cap_flush_forker(bdi))
+ continue;
- spin_lock(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock(&bdi_lock);
+ WARN(!test_bit(BDI_registered, &bdi->state),
+ "bdi %p/%s is not registered!\n", bdi, bdi->name);
- /*
- * We are now on the pending list, wake up bdi_forker_task()
- * to finish the job and add us back to the active bdi_list
- */
- wake_up_process(default_backing_dev_info.wb.task);
-}
+ have_dirty_io = !list_empty(&bdi->work_list) ||
+ wb_has_dirty_io(&bdi->wb);
-/*
- * Add the default flusher task that gets created for any bdi
- * that has dirty data pending writeout
- */
-void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
-{
- if (!bdi_cap_writeback_dirty(bdi))
- return;
+ /*
+ * If the bdi has work to do, but the thread does not
+ * exist - create it.
+ */
+ if (!bdi->wb.task && have_dirty_io) {
+ /*
+ * Set the pending bit - if someone tries to
+ * unregister this bdi, they will wait on this bit.
+ */
+ set_bit(BDI_pending, &bdi->state);
+ action = FORK_THREAD;
+ break;
+ }
+
+ spin_lock(&bdi->wb_lock);
- if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
- printk(KERN_ERR "bdi %p/%s is not registered!\n",
- bdi, bdi->name);
- return;
- }
+ /*
+ * If there is no work to do and the bdi thread was
+ * inactive long enough - kill it. The wb_lock is taken
+ * to make sure no-one adds more work to this bdi and
+ * wakes the bdi thread up.
+ */
+ if (bdi->wb.task && !have_dirty_io &&
+ time_after(jiffies, bdi->wb.last_active +
+ bdi_longest_inactive())) {
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock(&bdi->wb_lock);
+ set_bit(BDI_pending, &bdi->state);
+ action = KILL_THREAD;
+ break;
+ }
+ spin_unlock(&bdi->wb_lock);
+ }
+ spin_unlock_bh(&bdi_lock);
- /*
- * Check with the helper whether to proceed adding a task. Will only
- * abort if we two or more simultanous calls to
- * bdi_add_default_flusher_task() occured, further additions will block
- * waiting for previous additions to finish.
- */
- if (!test_and_set_bit(BDI_pending, &bdi->state)) {
- list_del_rcu(&bdi->bdi_list);
+ /* Keep working if default bdi still has things to do */
+ if (!list_empty(&me->bdi->work_list))
+ __set_current_state(TASK_RUNNING);
+
+ switch (action) {
+ case FORK_THREAD:
+ __set_current_state(TASK_RUNNING);
+ task = kthread_create(bdi_writeback_thread, &bdi->wb,
+ "flush-%s", dev_name(bdi->dev));
+ if (IS_ERR(task)) {
+ /*
+ * If thread creation fails, force writeout of
+ * the bdi from the thread.
+ */
+ bdi_flush_io(bdi);
+ } else {
+ /*
+ * The spinlock makes sure we do not lose
+ * wake-ups when racing with 'bdi_queue_work()'.
+ * And as soon as the bdi thread is visible, we
+ * can start it.
+ */
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->wb.task = task;
+ spin_unlock_bh(&bdi->wb_lock);
+ wake_up_process(task);
+ }
+ break;
+
+ case KILL_THREAD:
+ __set_current_state(TASK_RUNNING);
+ kthread_stop(task);
+ break;
+
+ case NO_ACTION:
+ if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+ /*
+ * There is no dirty data. The only thing we
+ * should now care about is checking for
+ * inactive bdi threads and killing them. Thus,
+ * let's sleep for a longer time, save energy and
+ * be friendly to battery-driven devices.
+ */
+ schedule_timeout(bdi_longest_inactive());
+ else
+ schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+ try_to_freeze();
+ /* Back to the main loop */
+ continue;
+ }
/*
- * We must wait for the current RCU period to end before
- * moving to the pending list. So schedule that operation
- * from an RCU callback.
+ * Clear pending bit and wakeup anybody waiting to tear us down.
*/
- call_rcu(&bdi->rcu_head, bdi_add_to_pending);
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
}
+
+ return 0;
}
/*
@@ -529,23 +518,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
va_list args;
- int ret = 0;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
- goto exit;
+ return 0;
va_start(args, fmt);
dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
va_end(args);
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- goto exit;
- }
-
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
bdi->dev = dev;
@@ -557,21 +539,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
if (bdi_cap_flush_forker(bdi)) {
struct bdi_writeback *wb = &bdi->wb;
- wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+ wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
dev_name(dev));
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
- ret = -ENOMEM;
-
- bdi_remove_from_list(bdi);
- goto exit;
- }
+ if (IS_ERR(wb->task))
+ return PTR_ERR(wb->task);
}
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
-exit:
- return ret;
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
EXPORT_SYMBOL(bdi_register);
@@ -586,31 +568,29 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- struct bdi_writeback *wb;
-
if (!bdi_cap_writeback_dirty(bdi))
return;
/*
- * If setup is pending, wait for that to complete first
+ * Make sure nobody finds us on the bdi_list anymore
*/
- wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
+ bdi_remove_from_list(bdi);
/*
- * Make sure nobody finds us on the bdi_list anymore
+ * If setup is pending, wait for that to complete first
*/
- bdi_remove_from_list(bdi);
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
/*
- * Finally, kill the kernel threads. We don't need to be RCU
+ * Finally, kill the kernel thread. We don't need to be RCU
* safe anymore, since the bdi is gone from visibility. Force
* unfreeze of the thread before calling kthread_stop(), otherwise
* it would never exet if it is currently stuck in the refrigerator.
*/
- list_for_each_entry(wb, &bdi->wb_list, list) {
- thaw_process(wb->task);
- kthread_stop(wb->task);
+ if (bdi->wb.task) {
+ thaw_process(bdi->wb.task);
+ kthread_stop(bdi->wb.task);
}
}
@@ -632,7 +612,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
+ del_timer_sync(&bdi->wb.wakeup_timer);
if (!bdi_cap_flush_forker(bdi))
bdi_wb_shutdown(bdi);
@@ -643,6 +625,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_unregister);
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+}
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -653,19 +647,11 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
- INIT_RCU_HEAD(&bdi->rcu_head);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
bdi_wb_init(&bdi->wb, bdi);
- /*
- * Just one thread support for now, hard code mask and count
- */
- bdi->wb_mask = 1;
- bdi->wb_cnt = 1;
-
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
@@ -712,10 +698,38 @@ void bdi_destroy(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_destroy);
+/*
+ * For use from filesystems to quickly init and register a bdi associated
+ * with dirty writeback
+ */
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
+ unsigned int cap)
+{
+ char tmp[32];
+ int err;
+
+ bdi->name = name;
+ bdi->capabilities = cap;
+ err = bdi_init(bdi);
+ if (err)
+ return err;
+
+ sprintf(tmp, "%.28s%s", name, "-%d");
+ err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+ if (err) {
+ bdi_destroy(bdi);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(bdi_setup_and_register);
+
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+static atomic_t nr_bdi_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
@@ -723,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
- clear_bit(bit, &bdi->state);
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
@@ -735,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
enum bdi_state bit;
bit = sync ? BDI_sync_congested : BDI_async_congested;
- set_bit(bit, &bdi->state);
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
@@ -751,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
long congestion_wait(int sync, long timeout)
{
long ret;
+ unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
return ret;
}
EXPORT_SYMBOL(congestion_wait);
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..13b0caa9793c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -10,9 +10,12 @@
*/
#include <linux/init.h>
#include <linux/pfn.h>
+#include <linux/slab.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
#include <asm/bug.h>
#include <asm/io.h>
@@ -32,6 +35,7 @@ unsigned long max_pfn;
unsigned long saved_max_pfn;
#endif
+#ifndef CONFIG_NO_BOOTMEM
bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +146,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
min_low_pfn = start;
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
-
+#endif
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -167,6 +171,53 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
}
}
+#ifdef CONFIG_NO_BOOTMEM
+/*
+ * __free_pages_memory - release the page frames in [start, end) through
+ * __free_pages_bootmem().
+ * @start: first page frame number to release
+ * @end: page frame number one past the last frame to release
+ *
+ * The BITS_PER_LONG-aligned middle of the range is released in blocks of
+ * order ilog2(BITS_PER_LONG). Unaligned head and tail frames, and ranges
+ * that contain no aligned block at all, are released one page at a time.
+ */
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+	/* PFNs are unsigned long; an int cursor would truncate large ones */
+	unsigned long pfn;
+	unsigned long start_aligned, end_aligned;
+	int order = ilog2(BITS_PER_LONG);
+
+	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+	end_aligned = end & ~(BITS_PER_LONG - 1);
+
+	/* No aligned block fits: release everything page by page */
+	if (end_aligned <= start_aligned) {
+		for (pfn = start; pfn < end; pfn++)
+			__free_pages_bootmem(pfn_to_page(pfn), 0);
+
+		return;
+	}
+
+	/* Unaligned head */
+	for (pfn = start; pfn < start_aligned; pfn++)
+		__free_pages_bootmem(pfn_to_page(pfn), 0);
+
+	/* Aligned middle, BITS_PER_LONG pages per call */
+	for (pfn = start_aligned; pfn < end_aligned; pfn += BITS_PER_LONG)
+		__free_pages_bootmem(pfn_to_page(pfn), order);
+
+	/* Unaligned tail */
+	for (pfn = end_aligned; pfn < end; pfn++)
+		__free_pages_bootmem(pfn_to_page(pfn), 0);
+}
+
+/*
+ * free_all_memory_core_early - release every range reported by
+ * get_free_all_memory_range() for @nodeid.
+ * @nodeid: node identifier passed through to get_free_all_memory_range()
+ *
+ * Returns the sum of (end - start) over all released ranges.
+ */
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+	struct range *ranges = NULL;
+	unsigned long freed = 0;
+	int nr_ranges;
+	int idx;
+
+	nr_ranges = get_free_all_memory_range(&ranges, nodeid);
+
+	for (idx = 0; idx < nr_ranges; idx++) {
+		u64 first = ranges[idx].start;
+		u64 last = ranges[idx].end;
+
+		freed += last - first;
+		__free_pages_memory(first, last);
+	}
+
+	return freed;
+}
+#else
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
@@ -227,6 +278,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
+#endif
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +289,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
+#ifdef CONFIG_NO_BOOTMEM
+ /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+ return 0;
+#else
return free_all_bootmem_core(pgdat->bdata);
+#endif
}
/**
@@ -247,9 +304,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
*/
unsigned long __init free_all_bootmem(void)
{
- return free_all_bootmem_core(NODE_DATA(0)->bdata);
+#ifdef CONFIG_NO_BOOTMEM
+ /*
+ * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+ * because in some cases, e.g. when Node0 doesn't have RAM installed,
+ * low RAM will be on Node1.
+ * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
+ * are used instead of only those related to Node0.
+ */
+ return free_all_memory_core_early(MAX_NUMNODES);
+#else
+ unsigned long total_pages = 0;
+ bootmem_data_t *bdata;
+
+ list_for_each_entry(bdata, &bdata_list, list)
+ total_pages += free_all_bootmem_core(bdata);
+
+ return total_pages;
+#endif
}
+#ifndef CONFIG_NO_BOOTMEM
static void __init __free(bootmem_data_t *bdata,
unsigned long sidx, unsigned long eidx)
{
@@ -344,6 +419,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
}
BUG();
}
+#endif
/**
* free_bootmem_node - mark a page range as usable
@@ -358,6 +434,10 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ kmemleak_free_part(__va(physaddr), size);
+ memblock_x86_free_range(physaddr, physaddr + size);
+#else
unsigned long start, end;
kmemleak_free_part(__va(physaddr), size);
@@ -366,6 +446,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
end = PFN_DOWN(physaddr + size);
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+#endif
}
/**
@@ -379,6 +460,10 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ kmemleak_free_part(__va(addr), size);
+ memblock_x86_free_range(addr, addr + size);
+#else
unsigned long start, end;
kmemleak_free_part(__va(addr), size);
@@ -387,6 +472,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
end = PFN_DOWN(addr + size);
mark_bootmem(start, end, 0, 0);
+#endif
}
/**
@@ -403,12 +489,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
{
+#ifdef CONFIG_NO_BOOTMEM
+ panic("no bootmem");
+ return 0;
+#else
unsigned long start, end;
start = PFN_DOWN(physaddr);
end = PFN_UP(physaddr + size);
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+#endif
}
/**
@@ -424,12 +515,24 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
{
+#ifdef CONFIG_NO_BOOTMEM
+ panic("no bootmem");
+ return 0;
+#else
unsigned long start, end;
start = PFN_DOWN(addr);
end = PFN_UP(addr + size);
return mark_bootmem(start, end, 1, flags);
+#endif
+}
+
+#ifndef CONFIG_NO_BOOTMEM
+/*
+ * Weak default implementation: simply forwards the request to
+ * reserve_bootmem() and returns its result.
+ */
+int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
+{
+ return reserve_bootmem(phys, len, flags);
}
static unsigned long __init align_idx(struct bootmem_data *bdata,
@@ -582,12 +685,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
#endif
return NULL;
}
+#endif
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit)
{
+#ifdef CONFIG_NO_BOOTMEM
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+ if (ptr)
+ return ptr;
+
+ if (goal != 0) {
+ goal = 0;
+ goto restart;
+ }
+
+ return NULL;
+#else
bootmem_data_t *bdata;
void *region;
@@ -613,6 +737,7 @@ restart:
}
return NULL;
+#endif
}
/**
@@ -631,7 +756,13 @@ restart:
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
{
- return ___alloc_bootmem_nopanic(size, align, goal, 0);
+ unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+ limit = -1UL;
+#endif
+
+ return ___alloc_bootmem_nopanic(size, align, goal, limit);
}
static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -665,9 +796,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
{
- return ___alloc_bootmem(size, align, goal, 0);
+ unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+ limit = -1UL;
+#endif
+
+ return ___alloc_bootmem(size, align, goal, limit);
}
+#ifndef CONFIG_NO_BOOTMEM
static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
@@ -684,6 +822,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
return ___alloc_bootmem(size, align, goal, limit);
}
+#endif
/**
* __alloc_bootmem_node - allocate boot memory from a specific node
@@ -703,10 +842,58 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+#ifdef CONFIG_NO_BOOTMEM
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+ if (ptr)
+ return ptr;
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, -1ULL);
+#else
+ ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+#endif
+
+ return ptr;
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+ unsigned long end_pfn;
+
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+ /* update goal according ...MAX_DMA32_PFN */
+ end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+ (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+ void *ptr;
+ unsigned long new_goal;
+
+ new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+#ifdef CONFIG_NO_BOOTMEM
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ new_goal, -1ULL);
+#else
+ ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+ new_goal, 0);
+#endif
+ if (ptr)
+ return ptr;
+ }
+#endif
+
+ return __alloc_bootmem_node(pgdat, size, align, goal);
+
}
#ifdef CONFIG_SPARSEMEM
@@ -720,6 +907,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
+#ifdef CONFIG_NO_BOOTMEM
+ unsigned long pfn, goal, limit;
+
+ pfn = section_nr_to_pfn(section_nr);
+ goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+ return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+ SMP_CACHE_BYTES, goal, limit);
+#else
bootmem_data_t *bdata;
unsigned long pfn, goal, limit;
@@ -729,6 +926,7 @@ void * __init alloc_bootmem_section(unsigned long size,
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+#endif
}
#endif
@@ -740,11 +938,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+#ifdef CONFIG_NO_BOOTMEM
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+#else
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+#endif
if (ptr)
return ptr;
@@ -792,9 +995,21 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ void *ptr;
+
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node(pgdat->bdata, size, align,
+#ifdef CONFIG_NO_BOOTMEM
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+ if (ptr)
+ return ptr;
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+#else
+ ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
+#endif
+ return ptr;
}
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e34..1481de68184b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
@@ -115,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
*/
vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
- flush_dcache_page(tovec->bv_page);
bounce_copy_vec(tovec, vfrom);
+ flush_dcache_page(tovec->bv_page);
}
}
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 000000000000..6d592a021072
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,695 @@
+/*
+ * linux/mm/compaction.c
+ *
+ * Memory compaction for the reduction of external fragmentation. Note that
+ * this heavily depends upon page migration to do all the real heavy
+ * lifting
+ *
+ * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
+ */
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/backing-dev.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+ struct list_head freepages; /* List of free pages to migrate to */
+ struct list_head migratepages; /* List of pages being migrated */
+ unsigned long nr_freepages; /* Number of isolated free pages */
+ unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ bool sync; /* Synchronous migration */
+
+ /* Account for isolated anon and file pages */
+ unsigned long nr_anon;
+ unsigned long nr_file;
+
+ /*
+ * compact_node() stores -1 here to request compaction that is not
+ * driven by a particular allocation order.
+ */
+ unsigned int order; /* order a direct compactor needs */
+ int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ /* Zone being compacted; compaction_alloc() reads it back from here */
+ struct zone *zone;
+
+ /* One of the COMPACT_MODE_* values, e.g. COMPACT_MODE_KSWAPD */
+ int compact_mode;
+};
+
+/*
+ * Drain @freelist: unlink every page from it and give the page back with
+ * __free_page(). Returns how many pages were released.
+ */
+static unsigned long release_freepages(struct list_head *freelist)
+{
+	unsigned long nr_released = 0;
+
+	while (!list_empty(freelist)) {
+		struct page *victim;
+
+		victim = list_entry(freelist->next, struct page, lru);
+		list_del(&victim->lru);
+		__free_page(victim);
+		nr_released++;
+	}
+
+	return nr_released;
+}
+
+/*
+ * Isolate free pages onto a private freelist. Must hold zone->lock.
+ *
+ * Scans from @blockpfn up to the end of its pageblock (or the end of @zone,
+ * whichever comes first), splits every buddy page found into order-0 pages
+ * and adds those to @freelist. Returns the number of pages isolated.
+ */
+static unsigned long isolate_freepages_block(struct zone *zone,
+ unsigned long blockpfn,
+ struct list_head *freelist)
+{
+ unsigned long zone_end_pfn, end_pfn;
+ int nr_scanned = 0, total_isolated = 0;
+ struct page *cursor;
+
+ /* Get the last PFN we should scan for free pages at */
+ zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
+
+ /* Find the first usable PFN in the block to initialise page cursor */
+ for (; blockpfn < end_pfn; blockpfn++) {
+ if (pfn_valid_within(blockpfn))
+ break;
+ }
+ cursor = pfn_to_page(blockpfn);
+
+ /* Isolate free pages. This assumes the block is valid */
+ for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ int isolated, i;
+ struct page *page = cursor;
+
+ if (!pfn_valid_within(blockpfn))
+ continue;
+ nr_scanned++;
+
+ if (!PageBuddy(page))
+ continue;
+
+ /* Found a free page, break it into order-0 pages */
+ isolated = split_free_page(page);
+ total_isolated += isolated;
+ /* Queue each of the resulting order-0 pages individually */
+ for (i = 0; i < isolated; i++) {
+ list_add(&page->lru, freelist);
+ page++;
+ }
+
+ /* If a page was split, advance to the end of it */
+ if (isolated) {
+ /* -1 because the loop header advances both by one more */
+ blockpfn += isolated - 1;
+ cursor += isolated - 1;
+ }
+ }
+
+ trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+ return total_isolated;
+}
+
+/*
+ * Decide whether the pageblock containing @page may receive migrated pages.
+ * ISOLATE and RESERVE blocks are never used; otherwise the block qualifies
+ * if @page is a free page of at least pageblock order or the block is
+ * MIGRATE_MOVABLE.
+ */
+static bool suitable_migration_target(struct page *page)
+{
+	int block_type = get_pageblock_migratetype(page);
+
+	/* Leave memory hot-remove and the min_free_kbytes reserve alone */
+	if (block_type == MIGRATE_ISOLATE || block_type == MIGRATE_RESERVE)
+		return false;
+
+	/* A large enough free page is always an acceptable target */
+	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+		return true;
+
+	/* Otherwise only movable blocks are acceptable */
+	return block_type == MIGRATE_MOVABLE;
+}
+
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ *
+ * Walks pageblocks downwards from cc->free_pfn, stopping one pageblock
+ * above cc->migrate_pfn, and updates cc->free_pfn and cc->nr_freepages
+ * before returning.
+ */
+static void isolate_freepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ struct page *page;
+ unsigned long high_pfn, low_pfn, pfn;
+ unsigned long flags;
+ int nr_freepages = cc->nr_freepages;
+ struct list_head *freelist = &cc->freepages;
+
+ pfn = cc->free_pfn;
+ low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+ /* Fallback restart point if nothing is isolated in this pass */
+ high_pfn = low_pfn;
+
+ /*
+ * Isolate free pages until enough are available to migrate the
+ * pages on cc->migratepages. We stop searching if the migrate
+ * and free page scanners meet or enough free pages are isolated.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ pfn -= pageblock_nr_pages) {
+ unsigned long isolated;
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ /*
+ * Check for overlapping nodes/zones. It's possible on some
+ * configurations to have a setup like
+ * node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of
+ * pages do not belong to a single zone.
+ */
+ page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+ /* Check the block is suitable for migration */
+ if (!suitable_migration_target(page))
+ continue;
+
+ /* Found a block suitable for isolating free pages from */
+ isolated = isolate_freepages_block(zone, pfn, freelist);
+ nr_freepages += isolated;
+
+ /*
+ * Record the highest PFN we isolated pages from. When next
+ * looking for free pages, the search will restart here as
+ * page migration may have returned some pages to the allocator
+ */
+ if (isolated)
+ high_pfn = max(high_pfn, pfn);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /* split_free_page does not map the pages */
+ list_for_each_entry(page, freelist, lru) {
+ arch_alloc_page(page, 0);
+ kernel_map_pages(page, 1, 1);
+ }
+
+ /* Publish the new scanner position and free page count */
+ cc->free_pfn = high_pfn;
+ cc->nr_freepages = nr_freepages;
+}
+
+/*
+ * Count the anon and file pages currently on cc->migratepages, store the
+ * totals in cc->nr_anon / cc->nr_file and add them to the zone's
+ * NR_ISOLATED_ANON / NR_ISOLATED_FILE counters.
+ */
+static void acct_isolated(struct zone *zone, struct compact_control *cc)
+{
+	unsigned int per_lru[NR_LRU_LISTS] = { 0, };
+	struct page *pg;
+
+	/* Tally pages by the base type of the LRU they belong to */
+	list_for_each_entry(pg, &cc->migratepages, lru)
+		per_lru[page_lru_base_type(pg)]++;
+
+	cc->nr_anon = per_lru[LRU_ACTIVE_ANON] + per_lru[LRU_INACTIVE_ANON];
+	cc->nr_file = per_lru[LRU_ACTIVE_FILE] + per_lru[LRU_INACTIVE_FILE];
+
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+}
+
+/*
+ * Returns true when the zone's isolated pages (file + anon) outnumber half
+ * of the pages still on its LRU lists. Similar to the reclaim check, but
+ * different enough that the logic is not shared.
+ */
+static bool too_many_isolated(struct zone *zone)
+{
+	unsigned long nr_inactive, nr_active, nr_isolated;
+	unsigned long nr_on_lru;
+
+	nr_inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+		      zone_page_state(zone, NR_INACTIVE_ANON);
+	nr_active = zone_page_state(zone, NR_ACTIVE_FILE) +
+		    zone_page_state(zone, NR_ACTIVE_ANON);
+	nr_isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+		      zone_page_state(zone, NR_ISOLATED_ANON);
+
+	nr_on_lru = nr_inactive + nr_active;
+
+	return nr_isolated > nr_on_lru / 2;
+}
+
+/*
+ * Isolate all pages that can be migrated from the block pointed to by
+ * the migrate scanner within compact_control.
+ *
+ * Isolated pages are moved from the zone LRU onto cc->migratepages and
+ * cc->migrate_pfn is advanced. Returns cc->nr_migratepages, or 0 when
+ * nothing was scanned (scanners met, memory hole, or fatal signal).
+ */
+static unsigned long isolate_migratepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ unsigned long low_pfn, end_pfn;
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
+ struct list_head *migratelist = &cc->migratepages;
+
+ /* Do not scan outside zone boundaries */
+ low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+
+ /* Only scan within a pageblock boundary */
+ end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
+
+ /* Do not cross the free scanner or scan within a memory hole */
+ if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
+ cc->migrate_pfn = end_pfn;
+ return 0;
+ }
+
+ /*
+ * Ensure that there are not too many pages isolated from the LRU
+ * list by either parallel reclaimers or compaction. If there are,
+ * delay for some time until fewer pages are isolated
+ */
+ while (unlikely(too_many_isolated(zone))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ if (fatal_signal_pending(current))
+ return 0;
+ }
+
+ /* Time to isolate some pages for migration */
+ spin_lock_irq(&zone->lru_lock);
+ for (; low_pfn < end_pfn; low_pfn++) {
+ struct page *page;
+ if (!pfn_valid_within(low_pfn))
+ continue;
+ nr_scanned++;
+
+ /* Get the page and skip if free */
+ page = pfn_to_page(low_pfn);
+ if (PageBuddy(page))
+ continue;
+
+ /*
+ * For async migration, also only scan in MOVABLE blocks. Async
+ * migration is optimistic to see if the minimum amount of work
+ * satisfies the allocation
+ */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+ /* Jump to the last PFN of this block; the loop adds one */
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
+ continue;
+ }
+
+ if (!PageLRU(page))
+ continue;
+
+ /*
+ * PageLRU is set, and lru_lock excludes isolation,
+ * splitting and collapsing (collapsing has already
+ * happened if PageLRU is set).
+ */
+ if (PageTransHuge(page)) {
+ /* Skip over the whole compound page */
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
+ /* Try isolate the page */
+ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+ continue;
+
+ VM_BUG_ON(PageTransCompound(page));
+
+ /* Successfully isolated */
+ del_page_from_lru_list(zone, page, page_lru(page));
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+
+ /* Avoid isolating too much */
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ break;
+ }
+
+ /* Update the isolated-page counters while lru_lock is still held */
+ acct_isolated(zone, cc);
+
+ spin_unlock_irq(&zone->lru_lock);
+ /* Remember where the next call should resume scanning */
+ cc->migrate_pfn = low_pfn;
+
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
+ return cc->nr_migratepages;
+}
+
+/*
+ * Migration callback that hands out a destination page. Pages come from
+ * the free pages already isolated on the compact_control passed in @data;
+ * when that list is empty, isolate_freepages() is asked to refill it.
+ * Returns NULL if no free page could be obtained.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+					unsigned long data,
+					int **result)
+{
+	struct compact_control *ctl = (struct compact_control *)data;
+	struct list_head *pool = &ctl->freepages;
+	struct page *target;
+
+	/* Refill the pool on demand */
+	if (list_empty(pool))
+		isolate_freepages(ctl->zone, ctl);
+
+	/* Still nothing available: report failure to the caller */
+	if (list_empty(pool))
+		return NULL;
+
+	/* Hand out the first page on the list */
+	target = list_entry(pool->next, struct page, lru);
+	list_del(&target->lru);
+	ctl->nr_freepages--;
+
+	return target;
+}
+
+/*
+ * migrate_pages() knows nothing about compact_control, so the counters in
+ * @cc can go stale while migration runs. Once migration has finished,
+ * recount both lists by walking them and store the results.
+ */
+static void update_nr_listpages(struct compact_control *cc)
+{
+	struct page *pg;
+	int migrate_cnt = 0;
+	int free_cnt = 0;
+
+	list_for_each_entry(pg, &cc->migratepages, lru)
+		migrate_cnt++;
+	cc->nr_migratepages = migrate_cnt;
+
+	list_for_each_entry(pg, &cc->freepages, lru)
+		free_cnt++;
+	cc->nr_freepages = free_cnt;
+}
+
+/*
+ * compact_finished - decide whether the compaction run on @zone is done.
+ *
+ * Returns
+ * COMPACT_PARTIAL - a fatal signal is pending, or a direct compactor now
+ * has a suitable free page available
+ * COMPACT_COMPLETE - the migrate and free scanners have met
+ * COMPACT_CONTINUE - keep compacting
+ */
+static int compact_finished(struct zone *zone,
+			    struct compact_control *cc)
+{
+	unsigned int order;
+	unsigned long watermark;
+
+	if (fatal_signal_pending(current))
+		return COMPACT_PARTIAL;
+
+	/* Compaction run completes if the migrate and free scanner meet */
+	if (cc->free_pfn <= cc->migrate_pfn)
+		return COMPACT_COMPLETE;
+
+	/*
+	 * order == -1 is set by compact_node(): there is no particular
+	 * order to satisfy, so keep going until the scanners meet. This must
+	 * be tested before cc->order is used as a shift count below, where
+	 * -1 would be undefined behaviour.
+	 */
+	if (cc->order == -1)
+		return COMPACT_CONTINUE;
+
+	/* Compaction run is not finished if the watermark is not met */
+	if (cc->compact_mode != COMPACT_MODE_KSWAPD)
+		watermark = low_wmark_pages(zone);
+	else
+		watermark = high_wmark_pages(zone);
+	watermark += (1 << cc->order);
+
+	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+		return COMPACT_CONTINUE;
+
+	/*
+	 * Generating only one page of the right order is not enough
+	 * for kswapd, we must continue until we're above the high
+	 * watermark as a pool for high order GFP_ATOMIC allocations
+	 * too.
+	 */
+	if (cc->compact_mode == COMPACT_MODE_KSWAPD)
+		return COMPACT_CONTINUE;
+
+	/* Direct compactor: Is a suitable page free? */
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (order >= pageblock_order && zone->free_area[order].nr_free)
+			return COMPACT_PARTIAL;
+	}
+
+	return COMPACT_CONTINUE;
+}
+
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+	int fragindex;
+	unsigned long watermark;
+
+	/*
+	 * order == -1 is passed down from compact_node(): no specific
+	 * allocation order is being served, so always compact. Bail out
+	 * before order is used as a shift count or handed to
+	 * fragmentation_index(), neither of which is meaningful for -1.
+	 */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
+	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
+	 * This is because during migration, copies of pages need to be
+	 * allocated and for a short time, the footprint is higher
+	 */
+	watermark = low_wmark_pages(zone) + (2UL << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return COMPACT_SKIPPED;
+
+	/*
+	 * fragmentation index determines if allocation failures are due to
+	 * low memory or external fragmentation
+	 *
+	 * index of -1 implies allocations might succeed depending on watermarks
+	 * index towards 0 implies failure is due to lack of memory
+	 * index towards 1000 implies failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation.
+	 */
+	fragindex = fragmentation_index(zone, order);
+	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+		return COMPACT_SKIPPED;
+
+	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+		return COMPACT_PARTIAL;
+
+	return COMPACT_CONTINUE;
+}
+
+/*
+ * Run compaction on @zone as described by @cc: repeatedly isolate a batch
+ * of pages with isolate_migratepages() and migrate them with
+ * migrate_pages() until compact_finished() stops returning
+ * COMPACT_CONTINUE. Returns the COMPACT_* value that ended the run, or the
+ * compaction_suitable() verdict if compaction was not started.
+ */
+static int compact_zone(struct zone *zone, struct compact_control *cc)
+{
+ int ret;
+
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+ case COMPACT_PARTIAL:
+ case COMPACT_SKIPPED:
+ /* Compaction is likely to fail */
+ return ret;
+ case COMPACT_CONTINUE:
+ /* Fall through to compaction */
+ ;
+ }
+
+ /* Setup to move all movable pages to the end of the zone */
+ cc->migrate_pfn = zone->zone_start_pfn;
+ cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
+ /* Round the free scanner start down to a pageblock boundary */
+ cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+ migrate_prep_local();
+
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ unsigned long nr_migrate, nr_remaining;
+
+ /* Nothing isolated in this block: re-check and move on */
+ if (!isolate_migratepages(zone, cc))
+ continue;
+
+ nr_migrate = cc->nr_migratepages;
+ migrate_pages(&cc->migratepages, compaction_alloc,
+ (unsigned long)cc, false,
+ cc->sync);
+ /* Recount the lists; migrate_pages() does not update cc */
+ update_nr_listpages(cc);
+ nr_remaining = cc->nr_migratepages;
+
+ count_vm_event(COMPACTBLOCKS);
+ count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
+ if (nr_remaining)
+ count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+ trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+ nr_remaining);
+
+ /* Release LRU pages not migrated */
+ if (!list_empty(&cc->migratepages)) {
+ putback_lru_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ }
+
+ }
+
+ /* Release free pages and check accounting */
+ cc->nr_freepages -= release_freepages(&cc->freepages);
+ VM_BUG_ON(cc->nr_freepages != 0);
+
+ return ret;
+}
+
+/*
+ * Build a compact_control for one zone from the allocation parameters and
+ * run compact_zone() with it. Returns compact_zone()'s result.
+ */
+unsigned long compact_zone_order(struct zone *zone,
+				 int order, gfp_t gfp_mask,
+				 bool sync,
+				 int compact_mode)
+{
+	struct compact_control ctl = {
+		.zone = zone,
+		.order = order,
+		.sync = sync,
+		.compact_mode = compact_mode,
+		.migratetype = allocflags_to_migratetype(gfp_mask),
+		.nr_migratepages = 0,
+		.nr_freepages = 0,
+	};
+
+	INIT_LIST_HEAD(&ctl.migratepages);
+	INIT_LIST_HEAD(&ctl.freepages);
+
+	return compact_zone(zone, &ctl);
+}
+
+/*
+ * compaction_suitable() skips compaction when the fragmentation index is
+ * between 0 and this threshold (inclusive).
+ */
+int sysctl_extfrag_threshold = 500;
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @zonelist: The zonelist used for the current allocation
+ * @order: The order of the current allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
+ *
+ * This is the main entry point for direct page compaction. Returns the
+ * highest COMPACT_* status seen across the zones that were compacted, or
+ * COMPACT_SKIPPED when compaction was not attempted.
+ */
+unsigned long try_to_compact_pages(struct zonelist *zonelist,
+			int order, gfp_t gfp_mask, nodemask_t *nodemask,
+			bool sync)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct zoneref *zref;
+	struct zone *zone;
+	int highest = COMPACT_SKIPPED;
+
+	/*
+	 * Order-0 requests are assumed to be satisfiable by the page
+	 * allocator without special steps, so compaction is not worth
+	 * starting for them.
+	 */
+	if (!order)
+		return highest;
+
+	/* Compaction needs to be allowed both filesystem access and I/O */
+	if (!(gfp_mask & __GFP_FS) || !(gfp_mask & __GFP_IO))
+		return highest;
+
+	count_vm_event(COMPACTSTALL);
+
+	/* Compact each zone in the list */
+	for_each_zone_zonelist_nodemask(zone, zref, zonelist, high_zoneidx,
+								nodemask) {
+		int status = compact_zone_order(zone, order, gfp_mask, sync,
+						COMPACT_MODE_DIRECT_RECLAIM);
+
+		if (status > highest)
+			highest = status;
+
+		/* If a normal allocation would succeed, stop compacting */
+		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+			break;
+	}
+
+	return highest;
+}
+
+
+/*
+ * Compact every populated zone of node @nid.
+ * Returns -EINVAL for an out-of-range or offline node, 0 otherwise.
+ */
+static int compact_node(int nid)
+{
+	pg_data_t *pgdat;
+	int idx;
+
+	if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
+		return -EINVAL;
+	pgdat = NODE_DATA(nid);
+
+	/* Flush pending updates to the LRU lists */
+	lru_add_drain_all();
+
+	for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+		struct zone *zone = &pgdat->node_zones[idx];
+		/* order -1: not compacting on behalf of a specific order */
+		struct compact_control cc = {
+			.nr_freepages = 0,
+			.nr_migratepages = 0,
+			.order = -1,
+			.compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
+			.zone = zone,
+		};
+
+		if (!populated_zone(zone))
+			continue;
+
+		INIT_LIST_HEAD(&cc.freepages);
+		INIT_LIST_HEAD(&cc.migratepages);
+
+		compact_zone(zone, &cc);
+
+		/* Both lists must have been emptied again by compact_zone() */
+		VM_BUG_ON(!list_empty(&cc.freepages));
+		VM_BUG_ON(!list_empty(&cc.migratepages));
+	}
+
+	return 0;
+}
+
+/*
+ * Run compact_node() on every online node in the system.
+ * Always returns COMPACT_COMPLETE; per-node results are not inspected.
+ */
+static int compact_nodes(void)
+{
+	int node;
+
+	for_each_online_node(node) {
+		compact_node(node);
+	}
+
+	return COMPACT_COMPLETE;
+}
+
+/* The written value is actually unused, all memory is compacted */
+int sysctl_compact_memory;
+
+/*
+ * This is the entry point for compacting all nodes via /proc/sys/vm.
+ *
+ * Any write triggers compaction of every online node; reads do nothing.
+ * Always returns 0: compact_nodes() reports a COMPACT_* status, which is
+ * not an error code and must not be passed back as the handler result.
+ */
+int sysctl_compaction_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	if (write)
+		compact_nodes();
+
+	return 0;
+}
+
+/*
+ * sysctl handler that reads/writes the integer behind @table through
+ * proc_dointvec_minmax(). Its return value is propagated so that a
+ * rejected write is reported to the caller instead of being silently
+ * treated as success.
+ */
+int sysctl_extfrag_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	return proc_dointvec_minmax(table, write, buffer, length, ppos);
+}
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+/*
+ * sysfs store callback: compact the node identified by dev->id. The
+ * written data is ignored and the whole write is reported as consumed.
+ */
+ssize_t sysfs_compact_node(struct sys_device *dev,
+			struct sysdev_attribute *attr,
+			const char *buf, size_t count)
+{
+	int nid = dev->id;
+
+	compact_node(nid);
+
+	return count;
+}
+static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+
+/* Create the "compact" attribute file for @node; returns the sysdev result */
+int compaction_register_node(struct node *node)
+{
+	int err = sysdev_create_file(&node->sysdev, &attr_compact);
+
+	return err;
+}
+
+/* Remove the "compact" attribute file that was created for @node */
+void compaction_unregister_node(struct node *node)
+{
+	/* A void function must not return an expression */
+	sysdev_remove_file(&node->sysdev, &attr_compact);
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
size_t offset;
void *retval;
+ might_sleep_if(mem_flags & __GFP_WAIT);
+
spin_lock_irqsave(&pool->lock, flags);
restart:
list_for_each_entry(page, &pool->page_list, page_list) {
@@ -322,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (mem_flags & __GFP_WAIT) {
DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
__add_wait_queue(&pool->waitq, &wait);
spin_unlock_irqrestore(&pool->lock, flags);
@@ -353,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
{
- unsigned long flags;
struct dma_page *page;
- spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
if (dma < page->dma)
continue;
if (dma < (page->dma + pool->allocation))
- goto done;
+ return page;
}
- page = NULL;
- done:
- spin_unlock_irqrestore(&pool->lock, flags);
- return page;
+ return NULL;
}
/**
@@ -384,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
unsigned long flags;
unsigned int offset;
+ spin_lock_irqsave(&pool->lock, flags);
page = pool_find_page(pool, dma);
if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -399,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
offset = vaddr - page->vaddr;
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -416,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
chain = *(int *)(page->vaddr + chain);
continue;
}
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
"already free\n", pool->name,
@@ -430,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
memset(vaddr, POOL_POISON_FREED, pool->size);
#endif
- spin_lock_irqsave(&pool->lock, flags);
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
- file->f_ra.ra_pages = 0;
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_WILLNEED:
if (!mapping->a_ops->readpage) {
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..c5f88f240ddc 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,21 @@
#include <linux/fault-inject.h>
-#include <linux/gfp.h>
+#include <linux/slab.h>
static struct {
struct fault_attr attr;
u32 ignore_gfp_wait;
+ int cache_filter;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
struct dentry *ignore_gfp_wait_file;
+ struct dentry *cache_filter_file;
#endif
} failslab = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
+ .cache_filter = 0,
};
-bool should_failslab(size_t size, gfp_t gfpflags)
+bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
{
if (gfpflags & __GFP_NOFAIL)
return false;
@@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
return false;
+ if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ return false;
+
return should_fail(&failslab.attr, size);
}
@@ -30,7 +36,6 @@ static int __init setup_failslab(char *str)
__setup("failslab=", setup_failslab);
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
static int __init failslab_debugfs_init(void)
{
mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void)
debugfs_create_bool("ignore-gfp-wait", mode, dir,
&failslab.ignore_gfp_wait);
- if (!failslab.ignore_gfp_wait_file) {
+ failslab.cache_filter_file =
+ debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter);
+
+ if (!failslab.ignore_gfp_wait_file ||
+ !failslab.cache_filter_file) {
err = -ENOMEM;
+ debugfs_remove(failslab.cache_filter_file);
debugfs_remove(failslab.ignore_gfp_wait_file);
cleanup_fault_attr_dentries(&failslab.attr);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index e3736923220e..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -10,13 +10,13 @@
* the NFS filesystem used to do this differently, for example)
*/
#include <linux/module.h>
-#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
@@ -102,9 +102,6 @@
* ->inode_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->task->proc_lock
- * ->dcache_lock (proc_pid_lookup)
- *
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
* ->i_mmap_lock
@@ -143,14 +140,20 @@ void __remove_from_page_cache(struct page *page)
void remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
+ freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
}
+EXPORT_SYMBOL(remove_from_page_cache);
static int sync_page(void *word)
{
@@ -295,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
continue;
wait_on_page_writeback(page);
- if (PageError(page))
+ if (TestClearPageError(page))
ret = -EIO;
}
pagevec_release(&pvec);
@@ -441,7 +444,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
/*
* Splice_read and readahead add shmem/tmpfs pages into the page cache
* before shmem_readpage has a chance to mark them as SwapBacked: they
- * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+ * need to go on the anon lru below, and mem_cgroup_cache_charge
* (called in add_to_page_cache) needs to know where they're going too.
*/
if (mapping_cap_swap_backed(mapping))
@@ -452,7 +455,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
if (page_is_file_cache(page))
lru_cache_add_file(page);
else
- lru_cache_add_active_anon(page);
+ lru_cache_add_anon(page);
}
return ret;
}
@@ -461,9 +464,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
+ int n;
+ struct page *page;
+
if (cpuset_do_page_mem_spread()) {
- int n = cpuset_mem_spread_node();
- return alloc_pages_exact_node(n, gfp, 0);
+ get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ put_mems_allowed();
+ return page;
}
return alloc_pages(gfp, 0);
}
@@ -605,6 +614,19 @@ void __lock_page_nosync(struct page *page)
TASK_UNINTERRUPTIBLE);
}
+/* Fault-path page lock: returns 1 with @page locked, or 0 after dropping mmap_sem. */
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+			 unsigned int flags)
+{
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		up_read(&mm->mmap_sem);
+		wait_on_page_locked(page);
+		return 0;
+	}
+	__lock_page(page);
+	return 1;
+}
+
/**
* find_get_page - find and get a page reference
* @mapping: the address_space to search
@@ -624,7 +646,9 @@ repeat:
pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
if (pagep) {
page = radix_tree_deref_slot(pagep);
- if (unlikely(!page || page == RADIX_TREE_RETRY))
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_deref_retry(page))
goto repeat;
if (!page_cache_get_speculative(page))
@@ -640,6 +664,7 @@ repeat:
goto repeat;
}
}
+out:
rcu_read_unlock();
return page;
@@ -757,12 +782,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page)) {
+ if (ret)
+ start = pages[ret-1]->index;
goto restart;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -810,16 +834,9 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
- if (page->mapping == NULL || page->index != index)
- break;
-
if (!page_cache_get_speculative(page))
goto repeat;
@@ -829,6 +846,16 @@ repeat:
goto repeat;
}
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != index) {
+ page_cache_release(page);
+ break;
+ }
+
pages[ret] = page;
ret++;
index++;
@@ -867,11 +894,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (!page_cache_get_speculative(page))
@@ -1009,6 +1032,9 @@ find_page:
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))
goto page_not_up_to_date_locked;
@@ -1099,6 +1125,12 @@ page_not_up_to_date_locked:
}
readpage:
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures, eg. multipath errors.
+ * PG_error will be set again if readpage fails.
+ */
+ ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
@@ -1117,7 +1149,7 @@ readpage:
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
- * invalidate_inode_pages got it
+ * invalidate_mapping_pages got it
*/
unlock_page(page);
page_cache_release(page);
@@ -1263,7 +1295,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
- unsigned long seg;
+ unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
@@ -1290,21 +1322,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
}
- if (retval > 0)
+ if (retval > 0) {
*ppos = pos + retval;
- if (retval) {
+ count -= retval;
+ }
+
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read.
+ */
+ if (retval < 0 || !count || *ppos >= size) {
file_accessed(filp);
goto out;
}
}
}
+ count = retval;
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc;
+ loff_t offset = 0;
+
+ /*
+ * If we did a short DIO read we need to skip the section of the
+ * iov that we've already read data into.
+ */
+ if (count) {
+ if (count > iov[seg].iov_len) {
+ count -= iov[seg].iov_len;
+ continue;
+ }
+ offset = count;
+ count = 0;
+ }
desc.written = 0;
- desc.arg.buf = iov[seg].iov_base;
- desc.count = iov[seg].iov_len;
+ desc.arg.buf = iov[seg].iov_base + offset;
+ desc.count = iov[seg].iov_len - offset;
if (desc.count == 0)
continue;
desc.error = 0;
@@ -1500,25 +1558,30 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* waiting for the lock.
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
- lock_page(page);
-
- /* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto no_cached_page;
- }
} else {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
- page = find_lock_page(mapping, offset);
+ page = find_get_page(mapping, offset);
if (!page)
goto no_cached_page;
}
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return ret | VM_FAULT_RETRY;
+ }
+
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON(page->index != offset);
+
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -1986,7 +2049,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
struct inode *inode = file->f_mapping->host;
- unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ unsigned long limit = rlimit(RLIMIT_FSIZE);
if (unlikely(*pos < 0))
return -EINVAL;
@@ -2138,12 +2201,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
if (written > 0) {
- loff_t end = pos + written;
- if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
- i_size_write(inode, end);
+ pos += written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- *ppos = end;
+ *ppos = pos;
}
out:
return written;
@@ -2164,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
- if (likely(page))
+ if (page)
return page;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
@@ -2199,14 +2262,12 @@ static ssize_t generic_perform_write(struct file *file,
do {
struct page *page;
- pgoff_t index; /* Pagecache index for current page */
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1));
- index = pos >> PAGE_CACHE_SHIFT;
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));
@@ -2232,6 +2293,9 @@ again:
if (unlikely(status))
break;
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
pagefault_enable();
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..83364df74a33 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,7 @@
#include <linux/sched.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
+#include <linux/gfp.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
@@ -194,7 +195,7 @@ retry:
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..ec520c7b28df 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
}
} else {
if (!pte_file(pte))
@@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
- unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
int has_write_lock = 0;
@@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (start + size <= start)
return err;
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return err;
+
/* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!(vma->vm_flags & VM_CAN_NONLINEAR))
goto out;
- if (end <= start || start < vma->vm_start || end > vma->vm_end)
+ if (start < vma->vm_start || start + size > vma->vm_end)
goto out;
/* Must set VM_NONLINEAR before any pages are populated. */
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,8 +26,14 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/kgdb.h>
#include <asm/tlbflush.h>
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -41,6 +47,9 @@
unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
+
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat;
@@ -220,7 +229,7 @@ EXPORT_SYMBOL(kmap_high);
* @page: &struct page to pin
*
* Returns the page's current virtual memory address, or NULL if no mapping
- * exists. When and only when a non null address is returned then a
+ * exists. If and only if a non null address is returned then a
* matching call to kunmap_high() is necessary.
*
* This can be called from any context.
@@ -421,55 +430,3 @@ void __init page_address_init(void)
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
-
-#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
-
-void debug_kmap_atomic(enum km_type type)
-{
- static int warn_count = 10;
-
- if (unlikely(warn_count < 0))
- return;
-
- if (unlikely(in_interrupt())) {
- if (in_nmi()) {
- if (type != KM_NMI && type != KM_NMI_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (in_irq()) {
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (!irqs_disabled()) { /* softirq */
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
- type != KM_SKB_SUNRPC_DATA &&
- type != KM_SKB_DATA_SOFTIRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- }
- }
-
- if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
- type == KM_IRQ_PTE || type == KM_NMI ||
- type == KM_NMI_PTE ) {
- if (!irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
- if (irq_count() == 0 && !irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- }
-}
-
-#endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2346 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ *
+ * NOTE(review): the initializer below sets TRANSPARENT_HUGEPAGE_FLAG or
+ * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG only when the matching Kconfig
+ * option is selected, and it sets TRANSPARENT_HUGEPAGE_DEFRAG_FLAG (which
+ * the sysfs "defrag" file reports as "[always]") rather than the
+ * DEFRAG_REQ_MADV variant, so the description above may not match the
+ * compiled-in defaults -- confirm against the users of these flags.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/*
+ * default scan 8*512 pte (or vmas) every 30 second
+ * NOTE(review): the default khugepaged_scan_sleep_millisecs below is
+ * 10000 ms (10 seconds), not 30 seconds -- confirm which one is intended.
+ */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+/* counters exposed read-only through the pages_collapsed and full_scans sysfs files */
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+/* created by start_khugepaged() with kthread_run() while holding khugepaged_mutex */
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+} khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+/*
+ * Raise min_free_kbytes to the amount recommended for transparent
+ * hugepages and recompute the per-zone watermarks.
+ *
+ * Does nothing when neither TRANSPARENT_HUGEPAGE_FLAG nor
+ * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set. min_free_kbytes is only ever
+ * increased here, never lowered. Always returns 0 (also registered as a
+ * late initcall).
+ */
+static int set_recommended_min_free_kbytes(void)
+{
+	extern int min_free_kbytes;
+	unsigned long recommended_min;
+	unsigned long lowmem_cap;
+	struct zone *zone;
+	int nr_zones = 0;
+
+	if (!(test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+		       &transparent_hugepage_flags) ||
+	      test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+		       &transparent_hugepage_flags)))
+		return 0;
+
+	for_each_populated_zone(zone)
+		nr_zones++;
+
+	/* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+	recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+	/*
+	 * On average keep at least two pageblocks almost free of another
+	 * type: one for a migratetype to fall back to and a second one to
+	 * avoid subsequent fallbacks of other types. There are 3
+	 * MIGRATE_TYPES we care about.
+	 */
+	recommended_min += pageblock_nr_pages * nr_zones *
+			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+	/* don't ever allow to reserve more than 5% of the lowmem */
+	lowmem_cap = (unsigned long) nr_free_buffer_pages() / 20;
+	if (recommended_min > lowmem_cap)
+		recommended_min = lowmem_cap;
+
+	/* convert pages to kilobytes */
+	recommended_min <<= (PAGE_SHIFT-10);
+
+	if (recommended_min > min_free_kbytes)
+		min_free_kbytes = recommended_min;
+	setup_per_zone_wmarks();
+
+	return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
+/*
+ * Bring khugepaged in line with the current enable flags.
+ *
+ * When khugepaged_enabled() is false, khugepaged_wait is woken so that a
+ * running thread can notice and exit. Otherwise the thread is created if
+ * it does not exist yet, woken when the scan list is not empty, and
+ * min_free_kbytes is re-evaluated.
+ *
+ * Returns 0 on success, -ENOMEM when the slab cache or the hash table is
+ * missing, or the kthread_run() error.
+ */
+static int start_khugepaged(void)
+{
+	int wakeup;
+	int err = 0;
+
+	if (!khugepaged_enabled()) {
+		/* wakeup to exit */
+		wake_up_interruptible(&khugepaged_wait);
+		return err;
+	}
+
+	if (unlikely(!mm_slot_cache || !mm_slots_hash))
+		return -ENOMEM;
+
+	mutex_lock(&khugepaged_mutex);
+	if (!khugepaged_thread)
+		khugepaged_thread = kthread_run(khugepaged, NULL,
+						"khugepaged");
+	if (unlikely(IS_ERR(khugepaged_thread))) {
+		printk(KERN_ERR
+		       "khugepaged: kthread_run(khugepaged) failed\n");
+		err = PTR_ERR(khugepaged_thread);
+		khugepaged_thread = NULL;
+	}
+	wakeup = !list_empty(&khugepaged_scan.mm_head);
+	mutex_unlock(&khugepaged_mutex);
+
+	if (wakeup)
+		wake_up_interruptible(&khugepaged_wait);
+	set_recommended_min_free_kbytes();
+
+	return err;
+}
+
+#ifdef CONFIG_SYSFS
+
+/*
+ * Print a three-state setting ("always", "madvise", "never") backed by
+ * the two flag bits @enabled and @req_madv, with the active choice in
+ * square brackets. Returns the number of bytes written to @buf.
+ */
+static ssize_t double_flag_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf,
+				enum transparent_hugepage_flag enabled,
+				enum transparent_hugepage_flag req_madv)
+{
+	if (test_bit(enabled, &transparent_hugepage_flags)) {
+		/* the two bits are expected to be mutually exclusive */
+		VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
+		return sprintf(buf, "[always] madvise never\n");
+	}
+
+	if (test_bit(req_madv, &transparent_hugepage_flags))
+		return sprintf(buf, "always [madvise] never\n");
+
+	return sprintf(buf, "always madvise [never]\n");
+}
+/*
+ * Parse "always", "madvise" or "never" from @buf and update the @enabled
+ * and @req_madv bits accordingly. Only the first
+ * min(strlen(keyword), @count) bytes are compared, so a leading part of a
+ * keyword is accepted too. Returns @count, or -EINVAL when nothing
+ * matches.
+ */
+static ssize_t double_flag_store(struct kobject *kobj,
+				 struct kobj_attribute *attr,
+				 const char *buf, size_t count,
+				 enum transparent_hugepage_flag enabled,
+				 enum transparent_hugepage_flag req_madv)
+{
+	int want_enabled;
+	int want_req_madv;
+
+	if (!memcmp("always", buf, min(sizeof("always")-1, count))) {
+		want_enabled = 1;
+		want_req_madv = 0;
+	} else if (!memcmp("madvise", buf, min(sizeof("madvise")-1, count))) {
+		want_enabled = 0;
+		want_req_madv = 1;
+	} else if (!memcmp("never", buf, min(sizeof("never")-1, count))) {
+		want_enabled = 0;
+		want_req_madv = 0;
+	} else {
+		return -EINVAL;
+	}
+
+	if (want_enabled)
+		set_bit(enabled, &transparent_hugepage_flags);
+	else
+		clear_bit(enabled, &transparent_hugepage_flags);
+
+	if (want_req_madv)
+		set_bit(req_madv, &transparent_hugepage_flags);
+	else
+		clear_bit(req_madv, &transparent_hugepage_flags);
+
+	return count;
+}
+
+/* sysfs "enabled": show the always/madvise/never state of the THP flags */
+static ssize_t enabled_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag always =
+		TRANSPARENT_HUGEPAGE_FLAG;
+	const enum transparent_hugepage_flag madvise =
+		TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG;
+
+	return double_flag_show(kobj, attr, buf, always, madvise);
+}
+
+/*
+ * sysfs "enabled": update the THP flags, then let start_khugepaged() act
+ * on the new state and re-evaluate min_free_kbytes while THP is still on.
+ * Returns @count on success, or the error from parsing or from
+ * start_khugepaged().
+ */
+static ssize_t enabled_store(struct kobject *kobj,
+			     struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	ssize_t ret = double_flag_store(kobj, attr, buf, count,
+					TRANSPARENT_HUGEPAGE_FLAG,
+					TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+	if (ret > 0) {
+		int err = start_khugepaged();
+
+		if (err)
+			ret = err;
+	}
+
+	if (ret <= 0)
+		return ret;
+
+	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+		     &transparent_hugepage_flags) ||
+	    test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+		     &transparent_hugepage_flags))
+		set_recommended_min_free_kbytes();
+
+	return ret;
+}
+static struct kobj_attribute enabled_attr =
+	__ATTR(enabled, 0644, enabled_show, enabled_store);
+
+/*
+ * Print the state of the boolean @flag as "[yes] no" or "yes [no]".
+ * Returns the number of bytes written to @buf.
+ */
+static ssize_t single_flag_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf,
+				enum transparent_hugepage_flag flag)
+{
+	if (!test_bit(flag, &transparent_hugepage_flags))
+		return sprintf(buf, "yes [no]\n");
+
+	return sprintf(buf, "[yes] no\n");
+}
+/*
+ * Parse "yes" or "no" from @buf and set or clear @flag. Only the first
+ * min(strlen(keyword), @count) bytes are compared. Returns @count, or
+ * -EINVAL when neither keyword matches.
+ */
+static ssize_t single_flag_store(struct kobject *kobj,
+				 struct kobj_attribute *attr,
+				 const char *buf, size_t count,
+				 enum transparent_hugepage_flag flag)
+{
+	int yes = !memcmp("yes", buf, min(sizeof("yes")-1, count));
+
+	if (!yes && memcmp("no", buf, min(sizeof("no")-1, count)))
+		return -EINVAL;
+
+	if (yes)
+		set_bit(flag, &transparent_hugepage_flags);
+	else
+		clear_bit(flag, &transparent_hugepage_flags);
+
+	return count;
+}
+
+/*
+ * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
+ * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
+ * memory just to allocate one more hugepage.
+ */
+/* sysfs "defrag": show the always/madvise/never state of the defrag flags */
+static ssize_t defrag_show(struct kobject *kobj,
+			   struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag always =
+		TRANSPARENT_HUGEPAGE_DEFRAG_FLAG;
+	const enum transparent_hugepage_flag madvise =
+		TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG;
+
+	return double_flag_show(kobj, attr, buf, always, madvise);
+}
+
+/* sysfs "defrag": parse always/madvise/never into the defrag flags */
+static ssize_t defrag_store(struct kobject *kobj,
+			    struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	const enum transparent_hugepage_flag always =
+		TRANSPARENT_HUGEPAGE_DEFRAG_FLAG;
+	const enum transparent_hugepage_flag madvise =
+		TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG;
+
+	return double_flag_store(kobj, attr, buf, count, always, madvise);
+}
+static struct kobj_attribute defrag_attr =
+	__ATTR(defrag, 0644, defrag_show, defrag_store);
+
+#ifdef CONFIG_DEBUG_VM
+/* sysfs "debug_cow" (CONFIG_DEBUG_VM only): show the DEBUG_COW flag as yes/no */
+static ssize_t debug_cow_show(struct kobject *kobj,
+			      struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag flag =
+		TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG;
+
+	return single_flag_show(kobj, attr, buf, flag);
+}
+
+/* sysfs "debug_cow" (CONFIG_DEBUG_VM only): set the DEBUG_COW flag from yes/no */
+static ssize_t debug_cow_store(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	const enum transparent_hugepage_flag flag =
+		TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG;
+
+	return single_flag_store(kobj, attr, buf, count, flag);
+}
+static struct kobj_attribute debug_cow_attr =
+	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
+#endif /* CONFIG_DEBUG_VM */
+
+/*
+ * Attributes registered by hugepage_init() directly on the
+ * "transparent_hugepage" kobject (the group below has no .name);
+ * debug_cow is only present with CONFIG_DEBUG_VM.
+ */
+static struct attribute *hugepage_attr[] = {
+ &enabled_attr.attr,
+ &defrag_attr.attr,
+#ifdef CONFIG_DEBUG_VM
+ &debug_cow_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group hugepage_attr_group = {
+ .attrs = hugepage_attr,
+};
+
+/* sysfs "scan_sleep_millisecs": print the current value */
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 char *buf)
+{
+	unsigned int msecs = khugepaged_scan_sleep_millisecs;
+
+	return sprintf(buf, "%u\n", msecs);
+}
+
+/*
+ * sysfs "scan_sleep_millisecs": accept a base-10 value of at most
+ * UINT_MAX, store it and wake up khugepaged_wait. Returns @count, or
+ * -EINVAL for malformed or out-of-range input.
+ */
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val))
+		return -EINVAL;
+	if (val > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_scan_sleep_millisecs = val;
+	wake_up_interruptible(&khugepaged_wait);
+
+	return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+	       scan_sleep_millisecs_store);
+
+/* sysfs "alloc_sleep_millisecs": print the current value */
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  char *buf)
+{
+	unsigned int msecs = khugepaged_alloc_sleep_millisecs;
+
+	return sprintf(buf, "%u\n", msecs);
+}
+
+/*
+ * sysfs "alloc_sleep_millisecs": accept a base-10 value of at most
+ * UINT_MAX, store it and wake up khugepaged_wait. Returns @count, or
+ * -EINVAL for malformed or out-of-range input.
+ */
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val))
+		return -EINVAL;
+	if (val > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_alloc_sleep_millisecs = val;
+	wake_up_interruptible(&khugepaged_wait);
+
+	return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+	       alloc_sleep_millisecs_store);
+
+/* sysfs "pages_to_scan": print the current value */
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  char *buf)
+{
+	unsigned int pages = khugepaged_pages_to_scan;
+
+	return sprintf(buf, "%u\n", pages);
+}
+
+/*
+ * sysfs "pages_to_scan": accept a non-zero base-10 value of at most
+ * UINT_MAX. Returns @count, or -EINVAL for malformed, zero or
+ * out-of-range input.
+ */
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val))
+		return -EINVAL;
+	if (val == 0 || val > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_pages_to_scan = val;
+
+	return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
+	       pages_to_scan_store);
+
+/* sysfs "pages_collapsed" (read-only): print the khugepaged_pages_collapsed counter */
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	unsigned int collapsed = khugepaged_pages_collapsed;
+
+	return sprintf(buf, "%u\n", collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+	__ATTR_RO(pages_collapsed);
+
+/* sysfs "full_scans" (read-only): print the khugepaged_full_scans counter */
+static ssize_t full_scans_show(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       char *buf)
+{
+	unsigned int scans = khugepaged_full_scans;
+
+	return sprintf(buf, "%u\n", scans);
+}
+static struct kobj_attribute full_scans_attr =
+	__ATTR_RO(full_scans);
+
+/* sysfs "khugepaged/defrag": show the DEFRAG_KHUGEPAGED flag as yes/no */
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag flag =
+		TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG;
+
+	return single_flag_show(kobj, attr, buf, flag);
+}
+
+/* sysfs "khugepaged/defrag": set the DEFRAG_KHUGEPAGED flag from yes/no */
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t count)
+{
+	const enum transparent_hugepage_flag flag =
+		TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG;
+
+	return single_flag_store(kobj, attr, buf, count, flag);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+	__ATTR(defrag, 0644, khugepaged_defrag_show,
+	       khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+/* sysfs "khugepaged/max_ptes_none": print the current value */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+					     struct kobj_attribute *attr,
+					     char *buf)
+{
+	unsigned int limit = khugepaged_max_ptes_none;
+
+	return sprintf(buf, "%u\n", limit);
+}
+
+/*
+ * sysfs "khugepaged/max_ptes_none": accept a base-10 value in the range
+ * 0..HPAGE_PMD_NR-1. Returns @count, or -EINVAL for malformed or
+ * out-of-range input.
+ */
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+					      struct kobj_attribute *attr,
+					      const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val))
+		return -EINVAL;
+	if (val > HPAGE_PMD_NR-1)
+		return -EINVAL;
+
+	khugepaged_max_ptes_none = val;
+
+	return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+	       khugepaged_max_ptes_none_store);
+
+/*
+ * Attributes that make up the "khugepaged" sysfs group. The array is
+ * NULL-terminated.
+ */
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ NULL,
+};
+
+/*
+ * Named attribute group ("khugepaged") wrapping khugepaged_attr; it is
+ * registered on the transparent_hugepage kobject by hugepage_init().
+ */
+static struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+#endif /* CONFIG_SYSFS */
+
+/*
+ * Boot-time initialization of transparent hugepage support.
+ *
+ * Steps, in order:
+ *  - bail out with -EINVAL (and clear transparent_hugepage_flags) if
+ *    has_transparent_hugepage() reports no support;
+ *  - with CONFIG_SYSFS, create the "transparent_hugepage" kobject under
+ *    mm_kobj and register the hugepage and khugepaged attribute groups;
+ *  - set up the mm_slot slab cache and the mm_slots hash;
+ *  - disable THP by default when totalram_pages is below 512MB worth
+ *    of pages, then start khugepaged and adjust min_free_kbytes.
+ *
+ * Returns 0 on success or a negative errno. On any failure after the
+ * sysfs objects were created they are torn down again, so a failed
+ * init does not leave a kobject or attribute groups behind.
+ */
+static int __init hugepage_init(void)
+{
+	int err;
+#ifdef CONFIG_SYSFS
+	static struct kobject *hugepage_kobj;
+#endif
+
+	if (!has_transparent_hugepage()) {
+		transparent_hugepage_flags = 0;
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_SYSFS
+	hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+	if (unlikely(!hugepage_kobj)) {
+		printk(KERN_ERR "hugepage: failed kobject create\n");
+		return -ENOMEM;
+	}
+
+	err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+	if (err) {
+		printk(KERN_ERR "hugepage: failed register hugepage group\n");
+		goto out_put_kobj;
+	}
+
+	err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+	if (err) {
+		printk(KERN_ERR "hugepage: failed register khugepaged group\n");
+		goto out_remove_hugepage_group;
+	}
+#endif
+
+	err = khugepaged_slab_init();
+	if (err)
+		goto out_remove_sysfs;
+
+	err = mm_slots_hash_init();
+	if (err) {
+		khugepaged_slab_free();
+		goto out_remove_sysfs;
+	}
+
+	/*
+	 * By default disable transparent hugepages on smaller systems,
+	 * where the extra memory used could hurt more than TLB overhead
+	 * is likely to save. The admin can still enable it through /sys.
+	 */
+	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+		transparent_hugepage_flags = 0;
+
+	start_khugepaged();
+
+	set_recommended_min_free_kbytes();
+
+	return 0;
+
+	/* Unwind in reverse order of setup. */
+out_remove_sysfs:
+#ifdef CONFIG_SYSFS
+	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
+out_remove_hugepage_group:
+	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
+out_put_kobj:
+	kobject_put(hugepage_kobj);
+#endif
+	return err;
+}
+module_init(hugepage_init)
+
+/*
+ * Parser for the "transparent_hugepage=" boot parameter.
+ *
+ * Accepted values and their effect on transparent_hugepage_flags:
+ *   "always"  - set TRANSPARENT_HUGEPAGE_FLAG, clear REQ_MADV flag
+ *   "madvise" - clear TRANSPARENT_HUGEPAGE_FLAG, set REQ_MADV flag
+ *   "never"   - clear both flags
+ *
+ * Returns 1 when the value was recognized. A NULL or unrecognized
+ * string leaves the flags untouched, logs a warning and returns 0.
+ */
+static int __init setup_transparent_hugepage(char *str)
+{
+	if (str && !strcmp(str, "always")) {
+		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+			&transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+			  &transparent_hugepage_flags);
+		return 1;
+	}
+
+	if (str && !strcmp(str, "madvise")) {
+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+			  &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+			&transparent_hugepage_flags);
+		return 1;
+	}
+
+	if (str && !strcmp(str, "never")) {
+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+			  &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+			  &transparent_hugepage_flags);
+		return 1;
+	}
+
+	printk(KERN_WARNING
+	       "transparent_hugepage= cannot parse, ignored\n");
+	return 0;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
+/*
+ * Stash a preallocated pte page table in mm->pmd_huge_pte so it can be
+ * retrieved later with get_pmd_huge_pte(). The tables are chained
+ * through their ->lru list heads and the newly deposited one becomes
+ * mm->pmd_huge_pte. The caller must hold mm->page_table_lock.
+ */
+static void prepare_pmd_huge_pte(pgtable_t pgtable,
+				 struct mm_struct *mm)
+{
+	assert_spin_locked(&mm->page_table_lock);
+
+	if (mm->pmd_huge_pte)
+		/* chain onto the tables deposited earlier */
+		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+	else
+		/* first deposit: start a fresh list */
+		INIT_LIST_HEAD(&pgtable->lru);
+
+	mm->pmd_huge_pte = pgtable;
+}
+
+/*
+ * Return pmd with the write bit set if the vma is writable
+ * (VM_WRITE), otherwise return pmd unchanged.
+ */
+static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+	if (unlikely(!(vma->vm_flags & VM_WRITE)))
+		return pmd;
+
+	return pmd_mkwrite(pmd);
+}
+
+/*
+ * Map a freshly allocated (and already memcg-charged) compound page as
+ * a huge pmd at haddr.
+ *
+ * A pte page table is allocated up front and deposited in the mm for
+ * a later split. If that allocation fails the page is uncharged and
+ * released and VM_FAULT_OOM is returned. If the pmd turns out to be
+ * populated once the page_table_lock is taken, the page and the page
+ * table are released and 0 is returned so the fault is simply
+ * retried. Returns 0 on success too.
+ */
+static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
+					struct vm_area_struct *vma,
+					unsigned long haddr, pmd_t *pmd,
+					struct page *page)
+{
+	pgtable_t pgtable;
+	pmd_t entry;
+
+	VM_BUG_ON(!PageCompound(page));
+
+	pgtable = pte_alloc_one(mm, haddr);
+	if (unlikely(!pgtable)) {
+		mem_cgroup_uncharge_page(page);
+		put_page(page);
+		return VM_FAULT_OOM;
+	}
+
+	clear_huge_page(page, haddr, HPAGE_PMD_NR);
+	__SetPageUptodate(page);
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_none(*pmd))) {
+		/* somebody else populated the pmd first: back out */
+		spin_unlock(&mm->page_table_lock);
+		mem_cgroup_uncharge_page(page);
+		put_page(page);
+		pte_free(mm, pgtable);
+		return 0;
+	}
+
+	entry = mk_pmd(page, vma->vm_page_prot);
+	entry = pmd_mkdirty(entry);
+	entry = maybe_pmd_mkwrite(entry, vma);
+	entry = pmd_mkhuge(entry);
+	/*
+	 * page_add_new_anon_rmap() takes the lru_lock, and that
+	 * spinlocking is relied upon as a full memory barrier ordering
+	 * the clear_huge_page() writes against the set_pmd_at() write
+	 * below.
+	 */
+	page_add_new_anon_rmap(page, vma, haddr);
+	set_pmd_at(mm, haddr, pmd, entry);
+	prepare_pmd_huge_pte(pgtable, mm);
+	add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	spin_unlock(&mm->page_table_lock);
+
+	return 0;
+}
+
+/*
+ * gfp mask for hugepage allocations: GFP_TRANSHUGE, with __GFP_WAIT
+ * stripped when defrag is zero.
+ */
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+	gfp_t mask = GFP_TRANSHUGE;
+
+	if (!defrag)
+		mask &= ~__GFP_WAIT;
+
+	return mask;
+}
+
+/*
+ * Allocate an order-HPAGE_PMD_ORDER page for the given vma/address via
+ * alloc_pages_vma(), using the gfp mask selected by defrag. Returns
+ * whatever alloc_pages_vma() returns.
+ */
+static inline struct page *alloc_hugepage_vma(int defrag,
+					      struct vm_area_struct *vma,
+					      unsigned long haddr)
+{
+	gfp_t gfp = alloc_hugepage_gfpmask(defrag);
+
+	return alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
+/*
+ * Non-NUMA variant: allocate an order-HPAGE_PMD_ORDER page with
+ * alloc_pages(), using the gfp mask selected by defrag.
+ */
+static inline struct page *alloc_hugepage(int defrag)
+{
+	gfp_t gfp = alloc_hugepage_gfpmask(defrag);
+
+	return alloc_pages(gfp, HPAGE_PMD_ORDER);
+}
+#endif
+
+/*
+ * Handle an anonymous page fault at address, trying a huge page first.
+ *
+ * If the HPAGE_PMD_SIZE-aligned range containing address lies entirely
+ * inside the vma, a huge page is allocated, charged to the memcg and
+ * mapped through __do_huge_pmd_anonymous_page(). If the range does not
+ * fit, or the allocation or the charge fails, the fault falls back to
+ * the regular pte path (the "out" label).
+ *
+ * Returns VM_FAULT_OOM when anon_vma_prepare(), khugepaged_enter() or
+ * __pte_alloc() fail, 0 when a huge pmd appeared concurrently (the
+ * fault is retried), and otherwise the result of the helper that
+ * handled the fault.
+ */
+int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pte_t *pte;
+
+ if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
+ /* no huge page available: fall back to small pages */
+ if (unlikely(!page))
+ goto out;
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ goto out;
+ }
+
+ return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+ }
+out:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ return VM_FAULT_OOM;
+ /* if an huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
+/*
+ * Duplicate a huge pmd mapping from src_mm into dst_mm.
+ *
+ * The source page gains a reference and an rmap, both the source and
+ * the destination pmd end up write-protected, and a preallocated pte
+ * page table is deposited in dst_mm.
+ *
+ * Returns 0 on success, -ENOMEM if the pte page table cannot be
+ * allocated, and -EAGAIN if the source pmd is no longer huge or is
+ * being split (in the latter case only after waiting for the split to
+ * finish).
+ */
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ struct page *src_page;
+ pmd_t pmd;
+ pgtable_t pgtable;
+ int ret;
+
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+
+ /* lock order: destination first, then source (nested annotation) */
+ spin_lock(&dst_mm->page_table_lock);
+ spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pmd = *src_pmd;
+ if (unlikely(!pmd_trans_huge(pmd))) {
+ pte_free(dst_mm, pgtable);
+ goto out_unlock;
+ }
+ if (unlikely(pmd_trans_splitting(pmd))) {
+ /* split huge page running from under us */
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+ pte_free(dst_mm, pgtable);
+
+ wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+ goto out;
+ }
+ src_page = pmd_page(pmd);
+ VM_BUG_ON(!PageHead(src_page));
+ get_page(src_page);
+ page_dup_rmap(src_page);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+ /* write-protect the source, install an old+wrprotected copy */
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmd = pmd_mkold(pmd_wrprotect(pmd));
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ prepare_pmd_huge_pte(pgtable, dst_mm);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+out:
+ return ret;
+}
+
+/*
+ * Withdraw one of the pte page tables previously deposited with
+ * prepare_pmd_huge_pte(). The table currently referenced by
+ * mm->pmd_huge_pte is returned; if other tables are chained behind it
+ * the next one becomes mm->pmd_huge_pte, otherwise the pointer is
+ * reset to NULL. The caller must hold mm->page_table_lock.
+ *
+ * No "address" argument is taken, so this destroys the page coloring
+ * of some archs.
+ */
+pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+{
+	pgtable_t pgtable;
+
+	assert_spin_locked(&mm->page_table_lock);
+
+	pgtable = mm->pmd_huge_pte;
+	if (!list_empty(&pgtable->lru)) {
+		/* more tables queued: advance to the next, unlink this one */
+		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+					      struct page, lru);
+		list_del(&pgtable->lru);
+	} else {
+		/* that was the last deposited table */
+		mm->pmd_huge_pte = NULL;
+	}
+
+	return pgtable;
+}
+
+/*
+ * Copy-on-write fallback used when no new huge page is available:
+ * replace the huge pmd mapping of @page with HPAGE_PMD_NR freshly
+ * allocated small pages holding a copy of its contents.
+ *
+ * @orig_pmd is the pmd value sampled by the caller; if *pmd no longer
+ * matches it once the page_table_lock is taken, all the new pages are
+ * released and 0 is returned so the fault is retried.
+ *
+ * Returns VM_FAULT_WRITE on success, VM_FAULT_OOM if the page pointer
+ * array, one of the small pages or its memcg charge cannot be
+ * obtained, 0 if the pmd changed under us.
+ */
+static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
+					struct vm_area_struct *vma,
+					unsigned long address,
+					pmd_t *pmd, pmd_t orig_pmd,
+					struct page *page,
+					unsigned long haddr)
+{
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	int ret = 0, i;
+	struct page **pages;
+
+	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
+			GFP_KERNEL);
+	if (unlikely(!pages)) {
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+					  vma, address);
+		if (unlikely(!pages[i] ||
+			     mem_cgroup_newpage_charge(pages[i], mm,
+						       GFP_KERNEL))) {
+			/* allocated but not charged: just drop it */
+			if (pages[i])
+				put_page(pages[i]);
+			/* unwind the pages that were fully set up */
+			mem_cgroup_uncharge_start();
+			while (--i >= 0) {
+				mem_cgroup_uncharge_page(pages[i]);
+				put_page(pages[i]);
+			}
+			mem_cgroup_uncharge_end();
+			kfree(pages);
+			ret |= VM_FAULT_OOM;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		/*
+		 * The user virtual address of subpage i is i pages past
+		 * haddr, so the stride is PAGE_SIZE (not PAGE_SHIFT).
+		 */
+		copy_user_highpage(pages[i], page + i,
+				   haddr + PAGE_SIZE*i, vma);
+		__SetPageUptodate(pages[i]);
+		cond_resched();
+	}
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, orig_pmd)))
+		goto out_free_pages;
+	VM_BUG_ON(!PageHead(page));
+
+	pmdp_clear_flush_notify(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	/* fill the deposited pte table through a temporary pmd */
+	pgtable = get_pmd_huge_pte(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		entry = mk_pte(pages[i], vma->vm_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		page_add_new_anon_rmap(pages[i], vma, haddr);
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	kfree(pages);
+
+	mm->nr_ptes++;
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	page_remove_rmap(page);
+	spin_unlock(&mm->page_table_lock);
+
+	ret |= VM_FAULT_WRITE;
+	/* drop the reference the old pmd mapping held on the huge page */
+	put_page(page);
+
+out:
+	return ret;
+
+out_free_pages:
+	spin_unlock(&mm->page_table_lock);
+	mem_cgroup_uncharge_start();
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		mem_cgroup_uncharge_page(pages[i]);
+		put_page(pages[i]);
+	}
+	mem_cgroup_uncharge_end();
+	kfree(pages);
+	goto out;
+}
+
+/*
+ * Handle a write-protect fault on a huge pmd.
+ *
+ * If the huge page has a mapcount of 1 the pmd is simply made young,
+ * dirty and (if the vma allows it) writable in place. Otherwise a new
+ * huge page is allocated and the old contents copied over; when no new
+ * huge page can be allocated (or THP is disabled for the vma, or the
+ * debug-cow switch is on) the copy is done with small pages through
+ * do_huge_pmd_wp_page_fallback().
+ *
+ * @orig_pmd is the pmd value the caller sampled; every time the
+ * page_table_lock is (re)taken *pmd is compared against it and the
+ * operation is abandoned if it changed.
+ *
+ * Returns a combination of VM_FAULT_* bits: VM_FAULT_WRITE when the
+ * mapping was made writable, VM_FAULT_OOM when the memcg charge fails,
+ * 0 when the pmd changed from under us.
+ */
+int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+{
+ int ret = 0;
+ struct page *page, *new_page;
+ unsigned long haddr;
+
+ VM_BUG_ON(!vma->anon_vma);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_unlock;
+
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ haddr = address & HPAGE_PMD_MASK;
+ /* mapped only once: reuse the page in place */
+ if (page_mapcount(page) == 1) {
+ pmd_t entry;
+ entry = pmd_mkyoung(orig_pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+ update_mmu_cache(vma, address, entry);
+ ret |= VM_FAULT_WRITE;
+ goto out_unlock;
+ }
+ /* pin the old page while the lock is dropped for the copy */
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ if (transparent_hugepage_enabled(vma) &&
+ !transparent_hugepage_debug_cow())
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
+ else
+ new_page = NULL;
+
+ if (unlikely(!new_page)) {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ put_page(page);
+ goto out;
+ }
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ put_page(new_page);
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ __SetPageUptodate(new_page);
+
+ spin_lock(&mm->page_table_lock);
+ /* drops the pin taken with get_page() above */
+ put_page(page);
+ if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+ mem_cgroup_uncharge_page(new_page);
+ put_page(new_page);
+ } else {
+ pmd_t entry;
+ VM_BUG_ON(!PageHead(page));
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
+ page_add_new_anon_rmap(new_page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache(vma, address, entry);
+ page_remove_rmap(page);
+ /* drops the reference held by the old pmd mapping */
+ put_page(page);
+ ret |= VM_FAULT_WRITE;
+ }
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+out:
+ return ret;
+}
+
+/*
+ * Look up the subpage of a huge pmd mapping that backs addr.
+ *
+ * Must be called with mm->page_table_lock held. Honours these flags:
+ *   FOLL_WRITE - return NULL if the pmd is not writable
+ *   FOLL_TOUCH - mark the pmd young and dirty
+ *   FOLL_GET   - take a reference on the returned page
+ *
+ * Returns the struct page for addr, or NULL as described above.
+ */
+struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+				   unsigned long addr,
+				   pmd_t *pmd,
+				   unsigned int flags)
+{
+	struct page *page;
+
+	assert_spin_locked(&mm->page_table_lock);
+
+	if ((flags & FOLL_WRITE) && !pmd_write(*pmd))
+		return NULL;
+
+	page = pmd_page(*pmd);
+	VM_BUG_ON(!PageHead(page));
+
+	if (flags & FOLL_TOUCH) {
+		/*
+		 * We should set the dirty bit only for FOLL_WRITE but
+		 * for now the dirty bit in the pmd is meaningless.
+		 * And if the dirty bit will become meaningful and
+		 * we'll only set it with FOLL_WRITE, an atomic
+		 * set_bit will be required on the pmd to set the
+		 * young bit, instead of the current set_pmd_at.
+		 */
+		pmd_t touched = pmd_mkyoung(pmd_mkdirty(*pmd));
+
+		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, touched);
+	}
+
+	/* step from the head page to the subpage covering addr */
+	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+	VM_BUG_ON(!PageCompound(page));
+
+	if (flags & FOLL_GET)
+		get_page(page);
+
+	return page;
+}
+
+/*
+ * Tear down a huge pmd mapping.
+ *
+ * Returns 1 if the pmd was a stable huge pmd and has been zapped:
+ * the pmd is cleared, the rmap and MM_ANONPAGES accounting dropped,
+ * the page handed to tlb_remove_page() and the deposited pte table
+ * freed. Returns 0 if the pmd is not huge, or if it was being split,
+ * in which case the split is waited for first.
+ */
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		 pmd_t *pmd)
+{
+	struct page *page;
+	pgtable_t pgtable;
+
+	spin_lock(&tlb->mm->page_table_lock);
+
+	if (unlikely(!pmd_trans_huge(*pmd))) {
+		spin_unlock(&tlb->mm->page_table_lock);
+		return 0;
+	}
+
+	if (unlikely(pmd_trans_splitting(*pmd))) {
+		spin_unlock(&tlb->mm->page_table_lock);
+		wait_split_huge_page(vma->anon_vma,
+				     pmd);
+		return 0;
+	}
+
+	pgtable = get_pmd_huge_pte(tlb->mm);
+	page = pmd_page(*pmd);
+	pmd_clear(pmd);
+	page_remove_rmap(page);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+	VM_BUG_ON(!PageHead(page));
+	spin_unlock(&tlb->mm->page_table_lock);
+
+	/* freeing happens outside the page_table_lock */
+	tlb_remove_page(tlb, page);
+	pte_free(tlb->mm, pgtable);
+
+	return 1;
+}
+
+/*
+ * mincore() helper for huge pmds.
+ *
+ * If *pmd is a stable huge pmd, every byte of vec covering
+ * [addr, end) is set to 1 and 1 is returned. Returns 0 if the pmd is
+ * not huge, or if it is being split (after waiting for the split).
+ */
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		     unsigned long addr, unsigned long end,
+		     unsigned char *vec)
+{
+	int stable;
+
+	spin_lock(&vma->vm_mm->page_table_lock);
+	if (unlikely(!pmd_trans_huge(*pmd))) {
+		spin_unlock(&vma->vm_mm->page_table_lock);
+		return 0;
+	}
+	stable = !pmd_trans_splitting(*pmd);
+	spin_unlock(&vma->vm_mm->page_table_lock);
+
+	if (unlikely(!stable)) {
+		wait_split_huge_page(vma->anon_vma, pmd);
+		return 0;
+	}
+
+	/*
+	 * All logical pages in the range are present
+	 * if backed by a huge page.
+	 */
+	memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+	return 1;
+}
+
+/*
+ * Apply new protection bits to a huge pmd.
+ *
+ * Returns 1 if *pmd was a stable huge pmd: it is rewritten with
+ * newprot and the TLB is flushed for the HPAGE_PMD_SIZE range starting
+ * at addr. Returns 0 if the pmd is not huge, or if it is being split
+ * (after waiting for the split to complete).
+ */
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		    unsigned long addr, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t entry;
+
+	spin_lock(&mm->page_table_lock);
+
+	if (unlikely(!pmd_trans_huge(*pmd))) {
+		spin_unlock(&mm->page_table_lock);
+		return 0;
+	}
+
+	if (unlikely(pmd_trans_splitting(*pmd))) {
+		spin_unlock(&mm->page_table_lock);
+		wait_split_huge_page(vma->anon_vma, pmd);
+		return 0;
+	}
+
+	entry = pmdp_get_and_clear(mm, addr, pmd);
+	entry = pmd_modify(entry, newprot);
+	set_pmd_at(mm, addr, pmd, entry);
+	spin_unlock(&mm->page_table_lock);
+
+	flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+	return 1;
+}
+
+/*
+ * Walk the page tables of mm and return the pmd mapping page as a
+ * huge pmd at address, or NULL.
+ *
+ * NULL is returned when address is not hugepage aligned, when any
+ * level of the walk is not present, when the pmd maps a different
+ * page or is not a huge pmd, and - if flag is
+ * PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG - when the pmd already has
+ * the splitting bit set. With PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG
+ * the pmd is asserted (VM_BUG_ON) to be splitting.
+ */
+pmd_t *page_check_address_pmd(struct page *page,
+			      struct mm_struct *mm,
+			      unsigned long address,
+			      enum page_check_address_pmd_flag flag)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (address & ~HPAGE_PMD_MASK)
+		return NULL;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || pmd_page(*pmd) != page)
+		return NULL;
+
+	/*
+	 * split_vma() may create temporary aliased mappings. There is
+	 * no risk as long as all huge pmd are found and have their
+	 * splitting bit set before __split_huge_page_refcount
+	 * runs. Finding the same huge pmd more than once during the
+	 * same rmap walk is not a problem.
+	 */
+	if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+	    pmd_trans_splitting(*pmd))
+		return NULL;
+
+	if (!pmd_trans_huge(*pmd))
+		return NULL;
+
+	VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
+		  !pmd_trans_splitting(*pmd));
+	return pmd;
+}
+
+/*
+ * First pass of a hugepage split for one vma: if page is mapped by a
+ * not-yet-splitting huge pmd at address, mark that pmd as splitting.
+ *
+ * Returns 1 if such a pmd was found and marked, 0 otherwise.
+ */
+static int __split_huge_page_splitting(struct page *page,
+				       struct vm_area_struct *vma,
+				       unsigned long address)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t *pmd;
+	int found;
+
+	spin_lock(&mm->page_table_lock);
+	pmd = page_check_address_pmd(page, mm, address,
+				     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+	found = pmd ? 1 : 0;
+	if (found) {
+		/*
+		 * We can't temporarily set the pmd to null in order
+		 * to split it, the pmd must remain marked huge at all
+		 * times or the VM won't take the pmd_trans_huge paths
+		 * and it won't wait on the anon_vma->root->lock to
+		 * serialize against split_huge_page*.
+		 */
+		pmdp_splitting_flush_notify(vma, address, pmd);
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	return found;
+}
+
+/*
+ * Second pass of a hugepage split: turn the compound page into
+ * HPAGE_PMD_NR independent pages.
+ *
+ * Under zone->lru_lock and the compound lock, each tail page gets its
+ * share of the refcount, a copy of the relevant head page flags, the
+ * head's mapcount and mapping, its own index, and is added to the LRU
+ * via lru_add_page_tail(). The zone statistics are then adjusted and
+ * the compound state of the head page cleared. Once the locks are
+ * dropped one reference is released on every tail page.
+ */
+static void __split_huge_page_refcount(struct page *page)
+{
+ int i;
+ unsigned long head_index = page->index;
+ struct zone *zone = page_zone(page);
+ int zonestat;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ compound_lock(page);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+
+ /* tail_page->_count cannot change */
+ atomic_sub(atomic_read(&page_tail->_count), &page->_count);
+ BUG_ON(page_count(page) <= 0);
+ atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+ BUG_ON(atomic_read(&page_tail->_count) <= 0);
+
+ /* after clearing PageTail the gup refcount can be released */
+ smp_mb();
+
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /*
+ * 1) clear PageTail before overwriting first_page
+ * 2) clear PageTail before clearing PageHead for VM_BUG_ON
+ */
+ smp_wmb();
+
+ /*
+ * __split_huge_page_splitting() already set the
+ * splitting bit in all pmd that could map this
+ * hugepage, that will ensure no CPU can alter the
+ * mapcount on the head page. The mapcount is only
+ * accounted in the head page and it has to be
+ * transferred to all tail pages in the below code. So
+ * for this code to be safe, during the split the mapcount
+ * can't change. But that doesn't mean userland can't
+ * keep changing and reading the page contents while
+ * we transfer the mapcount, so the pmd splitting
+ * status is achieved setting a reserved bit in the
+ * pmd, not by clearing the present bit.
+ */
+ BUG_ON(page_mapcount(page_tail));
+ page_tail->_mapcount = page->_mapcount;
+
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;
+
+ page_tail->index = ++head_index;
+
+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+
+ lru_add_page_tail(zone, page, page_tail);
+ }
+
+ /* one hugepage less, HPAGE_PMD_NR more regular anon pages */
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+
+ /*
+ * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+ * so adjust those appropriately if this page is on the LRU.
+ */
+ if (PageLRU(page)) {
+ zonestat = NR_LRU_BASE + page_lru(page);
+ __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+ }
+
+ ClearPageCompound(page);
+ compound_unlock(page);
+ spin_unlock_irq(&zone->lru_lock);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ }
+
+ /*
+ * Only the head page (now become a regular page) is required
+ * to be pinned by the caller.
+ */
+ BUG_ON(page_count(page) <= 0);
+}
+
+/*
+ * Third pass of a hugepage split for one vma: replace the (splitting)
+ * huge pmd mapping page at address with a regular pmd pointing to a
+ * pte table that maps the HPAGE_PMD_NR subpages individually.
+ *
+ * The pte table is the one deposited when the huge pmd was set up; it
+ * is filled through a temporary on-stack pmd before being made
+ * visible. Each pte inherits the write and young state of the huge
+ * pmd.
+ *
+ * Returns 1 if a splitting huge pmd was found and remapped, 0
+ * otherwise.
+ */
+static int __split_huge_page_map(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd, _pmd;
+ int ret = 0, i;
+ pgtable_t pgtable;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ if (pmd) {
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, haddr = address; i < HPAGE_PMD_NR;
+ i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!pmd_write(*pmd))
+ entry = pte_wrprotect(entry);
+ else
+ BUG_ON(page_mapcount(page) != 1);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+
+ mm->nr_ptes++;
+ smp_wmb(); /* make pte visible before pmd */
+ /*
+ * Up to this point the pmd is present and huge and
+ * userland has the whole access to the hugepage
+ * during the split (which happens in place). If we
+ * overwrite the pmd with the not-huge version
+ * pointing to the pte here (which of course we could
+ * if all CPUs were bug free), userland could trigger
+ * a small page size TLB miss on the small sized TLB
+ * while the hugepage TLB entry is still established
+ * in the huge TLB. Some CPU doesn't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
+ * Erratum 383 on page 93. Intel should be safe but it
+ * also warns that it's only safe if the permission
+ * and cache attributes of the two entries loaded in
+ * the two TLB is identical (which should be the case
+ * here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual
+ * address to be loaded simultaneously. So instead of
+ * doing "pmd_populate(); flush_tlb_range();" we first
+ * mark the current pmd notpresent (atomically because
+ * here the pmd_trans_huge and pmd_trans_splitting
+ * must remain set at all times on the pmd until the
+ * split is complete for this pmd), then we flush the
+ * SMP TLB and finally we write the non-huge version
+ * of the pmd entry with pmd_populate.
+ */
+ set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
+/*
+ * Split a huge page in three passes over every vma on the anon_vma
+ * list:
+ *  1) mark every huge pmd mapping the page as splitting,
+ *  2) split the compound page itself (__split_huge_page_refcount),
+ *  3) replace every splitting huge pmd with a regular pmd and ptes.
+ *
+ * The number of pmds handled in passes 1 and 3 must both equal the
+ * page's mapcount; any mismatch is logged and triggers BUG().
+ *
+ * Must be called with anon_vma->root->lock held.
+ */
+static void __split_huge_page(struct page *page,
+ struct anon_vma *anon_vma)
+{
+ int mapcount, mapcount2;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(!PageHead(page));
+ BUG_ON(PageTail(page));
+
+ mapcount = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ /* page is not mapped inside this vma */
+ if (addr == -EFAULT)
+ continue;
+ mapcount += __split_huge_page_splitting(page, vma, addr);
+ }
+ /*
+ * It is critical that new vmas are added to the tail of the
+ * anon_vma list. This guarantees that if copy_huge_pmd() runs
+ * and establishes a child pmd before
+ * __split_huge_page_splitting() freezes the parent pmd (so if
+ * we fail to prevent copy_huge_pmd() from running until the
+ * whole __split_huge_page() is complete), we will still see
+ * the newly established pmd of the child later during the
+ * walk, to be able to set it as pmd_trans_splitting too.
+ */
+ if (mapcount != page_mapcount(page))
+ printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG_ON(mapcount != page_mapcount(page));
+
+ __split_huge_page_refcount(page);
+
+ mapcount2 = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ if (addr == -EFAULT)
+ continue;
+ mapcount2 += __split_huge_page_map(page, vma, addr);
+ }
+ if (mapcount != mapcount2)
+ printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG_ON(mapcount != mapcount2);
+}
+
+/*
+ * Split an anonymous transparent hugepage into regular pages.
+ *
+ * Returns 1 if the anon_vma of the page could not be locked, and 0
+ * otherwise - both when the page was split here and when it was found
+ * to be no longer compound once the anon_vma lock was held.
+ */
+int split_huge_page(struct page *page)
+{
+	struct anon_vma *anon_vma;
+
+	BUG_ON(!PageAnon(page));
+
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
+		return 1;
+
+	if (PageCompound(page)) {
+		BUG_ON(!PageSwapBacked(page));
+		__split_huge_page(page, anon_vma);
+
+		BUG_ON(PageCompound(page));
+	}
+
+	page_unlock_anon_vma(anon_vma);
+	return 0;
+}
+
+/*
+ * madvise() backend for MADV_HUGEPAGE and MADV_NOHUGEPAGE.
+ *
+ * Updates *vm_flags so that exactly one of VM_HUGEPAGE/VM_NOHUGEPAGE
+ * is set according to advice. Returns -EINVAL if the requested flag is
+ * already set or the vma carries any of the flags in the unsupported
+ * set below, -ENOMEM if registering the vma with khugepaged fails
+ * (MADV_HUGEPAGE only), 0 otherwise - including for any other advice
+ * value, which is ignored.
+ */
+int hugepage_madvise(struct vm_area_struct *vma,
+		     unsigned long *vm_flags, int advice)
+{
+	/* Be somewhat over-protective like KSM for now! */
+	const unsigned long unsupported =
+		VM_SHARED | VM_MAYSHARE |
+		VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+		VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+		VM_MIXEDMAP | VM_SAO;
+
+	switch (advice) {
+	case MADV_HUGEPAGE:
+		if (*vm_flags & (VM_HUGEPAGE | unsupported))
+			return -EINVAL;
+		*vm_flags &= ~VM_NOHUGEPAGE;
+		*vm_flags |= VM_HUGEPAGE;
+		/*
+		 * If the vma become good for khugepaged to scan,
+		 * register it here without waiting a page fault that
+		 * may not happen any time soon.
+		 */
+		if (unlikely(khugepaged_enter_vma_merge(vma)))
+			return -ENOMEM;
+		break;
+	case MADV_NOHUGEPAGE:
+		if (*vm_flags & (VM_NOHUGEPAGE | unsupported))
+			return -EINVAL;
+		*vm_flags &= ~VM_HUGEPAGE;
+		*vm_flags |= VM_NOHUGEPAGE;
+		/*
+		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+		 * this vma even if we leave the mm registered in khugepaged if
+		 * it got registered before VM_NOHUGEPAGE was set.
+		 */
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the "khugepaged_mm_slot" slab cache used for struct mm_slot
+ * objects. Returns 0 on success, -ENOMEM if the cache cannot be
+ * created.
+ */
+static int __init khugepaged_slab_init(void)
+{
+	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+					  sizeof(struct mm_slot),
+					  __alignof__(struct mm_slot), 0, NULL);
+
+	return mm_slot_cache ? 0 : -ENOMEM;
+}
+
+/*
+ * Undo khugepaged_slab_init(): destroy the mm_slot slab cache and
+ * reset the pointer so alloc_mm_slot() sees it as unavailable.
+ */
+static void __init khugepaged_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ mm_slot_cache = NULL;
+}
+
+/*
+ * Allocate a zeroed struct mm_slot from the slab cache with
+ * GFP_KERNEL. Returns NULL if the allocation fails or if the cache
+ * does not exist because initialization failed.
+ */
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+	if (mm_slot_cache)
+		return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+
+	/* initialization failed */
+	return NULL;
+}
+
+/* Return an mm_slot obtained from alloc_mm_slot() to the slab cache. */
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+/*
+ * Allocate the zero-initialized array of MM_SLOTS_HASH_HEADS hash
+ * list heads used to look up mm_slots by mm. Returns 0 on success,
+ * -ENOMEM on allocation failure.
+ */
+static int __init mm_slots_hash_init(void)
+{
+	size_t bytes = MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head);
+
+	mm_slots_hash = kzalloc(bytes, GFP_KERNEL);
+
+	return mm_slots_hash ? 0 : -ENOMEM;
+}
+
+#if 0
+/*
+ * Counterpart of mm_slots_hash_init(): free the hash array and reset
+ * the pointer. Currently compiled out by the enclosing "#if 0".
+ */
+static void __init mm_slots_hash_free(void)
+{
+ kfree(mm_slots_hash);
+ mm_slots_hash = NULL;
+}
+#endif
+
+/*
+ * Look up the mm_slot registered for mm in the mm_slots hash.
+ * The bucket is selected from the mm pointer value divided by
+ * sizeof(struct mm_struct), modulo MM_SLOTS_HASH_HEADS. Returns the
+ * matching mm_slot, or NULL if mm has none.
+ */
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+	unsigned long idx = ((unsigned long)mm / sizeof(struct mm_struct))
+				% MM_SLOTS_HASH_HEADS;
+	struct hlist_head *bucket = &mm_slots_hash[idx];
+	struct hlist_node *node;
+	struct mm_slot *mm_slot;
+
+	hlist_for_each_entry(mm_slot, node, bucket, hash)
+		if (mm_slot->mm == mm)
+			return mm_slot;
+
+	return NULL;
+}
+
+/*
+ * Bind mm_slot to mm and add it at the head of the hash bucket
+ * selected the same way get_mm_slot() selects it.
+ */
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+				    struct mm_slot *mm_slot)
+{
+	unsigned long idx = ((unsigned long)mm / sizeof(struct mm_struct))
+				% MM_SLOTS_HASH_HEADS;
+	struct hlist_head *bucket = &mm_slots_hash[idx];
+
+	mm_slot->mm = mm;
+	hlist_add_head(&mm_slot->hash, bucket);
+}
+
+/* Return non-zero if mm has no users left (mm_users dropped to 0). */
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+	return !atomic_read(&mm->mm_users);
+}
+
+/*
+ * Register mm with khugepaged.
+ *
+ * An mm_slot is allocated first; if MMF_VM_HUGEPAGE was already set in
+ * mm->flags the mm is registered already and the slot is simply freed
+ * again. Otherwise the slot is hashed, queued at the tail of the
+ * khugepaged scan list, a reference is taken on mm->mm_count, and
+ * khugepaged is woken up if the list was empty before.
+ *
+ * Returns 0 on success (or if already registered), -ENOMEM if no
+ * mm_slot can be allocated.
+ */
+int __khugepaged_enter(struct mm_struct *mm)
+{
+	struct mm_slot *slot;
+	int list_was_empty;
+
+	slot = alloc_mm_slot();
+	if (!slot)
+		return -ENOMEM;
+
+	/* __khugepaged_exit() must not run from under us */
+	VM_BUG_ON(khugepaged_test_exit(mm));
+	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+		free_mm_slot(slot);
+		return 0;
+	}
+
+	spin_lock(&khugepaged_mm_lock);
+	insert_to_mm_slots_hash(mm, slot);
+	/*
+	 * Insert just behind the scanning cursor, to let the area settle
+	 * down a little.
+	 */
+	list_was_empty = list_empty(&khugepaged_scan.mm_head);
+	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
+	spin_unlock(&khugepaged_mm_lock);
+
+	atomic_inc(&mm->mm_count);
+	if (list_was_empty)
+		wake_up_interruptible(&khugepaged_wait);
+
+	return 0;
+}
+
+/*
+ * Register the mm of vma with khugepaged if the vma could hold at
+ * least one hugepage.
+ *
+ * Nothing is done (and 0 is returned) when the vma has no anon_vma
+ * yet, when it is a file or special mapping (vm_file or vm_ops set),
+ * or when it does not contain a full hugepage-aligned range.
+ * Otherwise the result of khugepaged_enter() is returned.
+ */
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+	unsigned long hstart, hend;
+
+	/*
+	 * Not yet faulted in so we will register later in the
+	 * page fault if needed.
+	 */
+	if (!vma->anon_vma)
+		return 0;
+
+	/* khugepaged not yet working on file or special mappings */
+	if (vma->vm_file || vma->vm_ops)
+		return 0;
+
+	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+	/* round the start up and the end down to hugepage boundaries */
+	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	hend = vma->vm_end & HPAGE_PMD_MASK;
+	if (hstart >= hend)
+		return 0;
+
+	return khugepaged_enter(vma);
+}
+
+/*
+ * Unregister mm from khugepaged.
+ *
+ * If the mm has a slot that khugepaged is not currently scanning, the
+ * slot is unhashed, unlinked and freed, MMF_VM_HUGEPAGE is cleared and
+ * the mm_count reference is dropped. If khugepaged is scanning this
+ * very slot, the function only waits for it by cycling mmap_sem in
+ * write mode. With no slot registered nothing is done.
+ */
+void __khugepaged_exit(struct mm_struct *mm)
+{
+	struct mm_slot *mm_slot;
+
+	spin_lock(&khugepaged_mm_lock);
+	mm_slot = get_mm_slot(mm);
+
+	if (!mm_slot) {
+		spin_unlock(&khugepaged_mm_lock);
+		return;
+	}
+
+	if (khugepaged_scan.mm_slot != mm_slot) {
+		hlist_del(&mm_slot->hash);
+		list_del(&mm_slot->mm_node);
+		spin_unlock(&khugepaged_mm_lock);
+
+		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+		free_mm_slot(mm_slot);
+		mmdrop(mm);
+		return;
+	}
+
+	spin_unlock(&khugepaged_mm_lock);
+	/*
+	 * This is required to serialize against
+	 * khugepaged_test_exit() (which is guaranteed to run
+	 * under mmap sem read mode). Stop here (after we
+	 * return all pagetables will be destroyed) until
+	 * khugepaged has finished working on the pagetables
+	 * under the mmap_sem.
+	 */
+	down_write(&mm->mmap_sem);
+	up_write(&mm->mmap_sem);
+}
+
+/*
+ * Undo the per-page work done by __collapse_huge_page_isolate():
+ * drop the NR_ISOLATED_ANON accounting, unlock the page and put it
+ * back on the LRU.
+ */
+static void release_pte_page(struct page *page)
+{
+ /* 0 stands for page_is_file_cache(page) == false */
+ dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+/*
+ * Release the pages mapped by the ptes in the range [pte, _pte),
+ * walking backwards from _pte - 1 down to pte. Entries that are none
+ * are skipped.
+ */
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+	pte_t *cur;
+
+	for (cur = _pte - 1; cur >= pte; cur--) {
+		pte_t pteval = *cur;
+
+		if (pte_none(pteval))
+			continue;
+		release_pte_page(pte_page(pteval));
+	}
+}
+
+/* Release the pages of all HPAGE_PMD_NR ptes starting at pte. */
+static void release_all_pte_pages(pte_t *pte)
+{
+ release_pte_pages(pte, pte + HPAGE_PMD_NR);
+}
+
+/*
+ * Check whether the HPAGE_PMD_NR ptes starting at pte can be collapsed
+ * into a hugepage and, if so, lock and isolate every mapped page from
+ * the LRU.
+ *
+ * The scan gives up - releasing the pages isolated so far - when more
+ * than khugepaged_max_ptes_none ptes are none, when a pte is not
+ * present or not writable, when no normal page backs it, when the
+ * page has a refcount other than 1, or when the page cannot be locked
+ * or isolated. Even if every pte passes, the pages are released again
+ * unless at least one of them was found referenced/young.
+ *
+ * Returns 1 if all pages are isolated and the collapse can proceed,
+ * 0 otherwise.
+ */
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte)
+{
+ struct page *page;
+ pte_t *_pte;
+ int referenced = 0, isolated = 0, none = 0;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ }
+ if (!pte_present(pteval) || !pte_write(pteval)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ VM_BUG_ON(PageCompound(page));
+ BUG_ON(!PageAnon(page));
+ VM_BUG_ON(!PageSwapBacked(page));
+
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+ * Isolate the page to avoid collapsing an hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /* 0 stands for page_is_file_cache(page) == false */
+ inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
+
+ /* If there is no mapped pte young don't collapse the page */
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (unlikely(!referenced))
+ release_all_pte_pages(pte);
+ else
+ isolated = 1;
+out:
+ return isolated;
+}
+
+/*
+ * Fill the new huge page from the HPAGE_PMD_NR ptes starting at pte.
+ *
+ * For an empty pte the matching subpage is cleared and the mm's
+ * MM_ANONPAGES counter is bumped by one.  For a populated pte the small
+ * page is copied into the matching subpage and then torn down: released
+ * via release_pte_page(), its pte cleared and rmap removed under ptl, and
+ * finally freed.  page and address advance by one small page per pte.
+ */
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl)
+{
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ pte_t pteval = *_pte;
+ struct page *src_page;
+
+ if (pte_none(pteval)) {
+ clear_user_highpage(page, address);
+ /* NOTE(review): presumably accounts the hole now backed by the huge page - confirm */
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ VM_BUG_ON(page_mapcount(src_page) != 1);
+ /* NOTE(review): presumably one ref from the mapping plus one from the LRU isolation - confirm */
+ VM_BUG_ON(page_count(src_page) != 2);
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+
+ /* step to the next small page / subpage */
+ address += PAGE_SIZE;
+ page++;
+ }
+}
+
+/*
+ * Try to replace the small pages mapped by the pmd at address with a
+ * single huge page.
+ *
+ * Entered with mm->mmap_sem held for read; every path returns with
+ * mmap_sem released.
+ *
+ * !CONFIG_NUMA: *hpage is a preallocated huge page owned by the caller.
+ * It is set to NULL only when the page has been consumed by a successful
+ * collapse; on every failure the caller keeps its reference.
+ *
+ * CONFIG_NUMA: *hpage must be NULL on entry; the huge page is allocated
+ * here and *hpage is set to ERR_PTR(-ENOMEM) if that allocation fails.
+ */
+static void collapse_huge_page(struct mm_struct *mm,
+				unsigned long address,
+				struct page **hpage,
+				struct vm_area_struct *vma)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd, _pmd;
+	pte_t *pte;
+	pgtable_t pgtable;
+	struct page *new_page;
+	spinlock_t *ptl;
+	int isolated;
+	unsigned long hstart, hend;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
+	VM_BUG_ON(!*hpage);
+	new_page = *hpage;
+#else
+	VM_BUG_ON(*hpage);
+	/*
+	 * Allocate the page while the vma is still valid and under
+	 * the mmap_sem read mode so there is no memory allocation
+	 * later when we take the mmap_sem in write mode. This is more
+	 * friendly behavior (OTOH it may actually hide bugs) to
+	 * filesystems in userland with daemons allocating memory in
+	 * the userland I/O paths. Allocating memory with the
+	 * mmap_sem in read mode is good idea also to allow greater
+	 * scalability.
+	 */
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	if (unlikely(!new_page)) {
+		up_read(&mm->mmap_sem);
+		*hpage = ERR_PTR(-ENOMEM);
+		return;
+	}
+#endif
+	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+		up_read(&mm->mmap_sem);
+#ifdef CONFIG_NUMA
+		/*
+		 * Only the NUMA build owns new_page here.  Without NUMA the
+		 * page is still referenced through *hpage by the caller, so
+		 * dropping it would leave *hpage dangling.
+		 */
+		put_page(new_page);
+#endif
+		return;
+	}
+
+	/* after allocating the hugepage upgrade to mmap_sem write mode */
+	up_read(&mm->mmap_sem);
+
+	/*
+	 * Prevent all access to pagetables with the exception of
+	 * gup_fast later handled by the ptep_clear_flush and the VM
+	 * handled by the anon_vma lock + PG_lock.
+	 */
+	down_write(&mm->mmap_sem);
+	if (unlikely(khugepaged_test_exit(mm)))
+		goto out;
+
+	/*
+	 * mmap_sem was dropped above, so the vma passed in may be gone:
+	 * look it up again and revalidate everything.
+	 */
+	vma = find_vma(mm, address);
+	if (unlikely(!vma))
+		goto out;
+	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	hend = vma->vm_end & HPAGE_PMD_MASK;
+	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+		goto out;
+
+	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+	    (vma->vm_flags & VM_NOHUGEPAGE))
+		goto out;
+
+	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+	if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+		goto out;
+	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	/* pmd can't go away or become huge under us */
+	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+		goto out;
+
+	anon_vma_lock(vma->anon_vma);
+
+	pte = pte_offset_map(pmd, address);
+	ptl = pte_lockptr(mm, pmd);
+
+	spin_lock(&mm->page_table_lock); /* probably unnecessary */
+	/*
+	 * After this gup_fast can't run anymore. This also removes
+	 * any huge TLB entry from the CPU so we won't allow
+	 * huge and small TLB entries for the same virtual address
+	 * to avoid the risk of CPU bugs in that area.
+	 */
+	_pmd = pmdp_clear_flush_notify(vma, address, pmd);
+	spin_unlock(&mm->page_table_lock);
+
+	spin_lock(ptl);
+	isolated = __collapse_huge_page_isolate(vma, address, pte);
+	spin_unlock(ptl);
+	pte_unmap(pte);
+
+	if (unlikely(!isolated)) {
+		/* put the original pmd back; the "out" path uncharges */
+		spin_lock(&mm->page_table_lock);
+		BUG_ON(!pmd_none(*pmd));
+		set_pmd_at(mm, address, pmd, _pmd);
+		spin_unlock(&mm->page_table_lock);
+		anon_vma_unlock(vma->anon_vma);
+		goto out;
+	}
+
+	/*
+	 * All pages are isolated and locked so anon_vma rmap
+	 * can't run anymore.
+	 */
+	anon_vma_unlock(vma->anon_vma);
+
+	__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+	__SetPageUptodate(new_page);
+	/* the old pte page table is kept and deposited below */
+	pgtable = pmd_pgtable(_pmd);
+	VM_BUG_ON(page_count(pgtable) != 1);
+	VM_BUG_ON(page_mapcount(pgtable) != 0);
+
+	_pmd = mk_pmd(new_page, vma->vm_page_prot);
+	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+	_pmd = pmd_mkhuge(_pmd);
+
+	/*
+	 * spin_lock() below is not the equivalent of smp_wmb(), so
+	 * this is needed to avoid the copy_huge_page writes to become
+	 * visible after the set_pmd_at() write.
+	 */
+	smp_wmb();
+
+	spin_lock(&mm->page_table_lock);
+	BUG_ON(!pmd_none(*pmd));
+	page_add_new_anon_rmap(new_page, vma, address);
+	set_pmd_at(mm, address, pmd, _pmd);
+	/*
+	 * NOTE(review): this used to pass an undeclared "entry"; _pmd is
+	 * the value just installed.  Confirm the argument type expected
+	 * by the architecture's update_mmu_cache().
+	 */
+	update_mmu_cache(vma, address, _pmd);
+	prepare_pmd_huge_pte(pgtable, mm);
+	mm->nr_ptes--;
+	spin_unlock(&mm->page_table_lock);
+
+#ifndef CONFIG_NUMA
+	/* the preallocated page has been consumed */
+	*hpage = NULL;
+#endif
+	khugepaged_pages_collapsed++;
+out_up_write:
+	up_write(&mm->mmap_sem);
+	return;
+
+out:
+	/*
+	 * Every path that gets here charged new_page successfully but
+	 * never mapped it, so the charge has to be undone.
+	 */
+	mem_cgroup_uncharge_page(new_page);
+#ifdef CONFIG_NUMA
+	put_page(new_page);
+#endif
+	goto out_up_write;
+}
+
+/*
+ * Check, under the pte lock, whether the pmd at address looks
+ * collapsible: at most khugepaged_max_ptes_none empty ptes, every other
+ * pte present and writable, each page a normal anonymous page that is on
+ * the LRU, unlocked and unpinned, and at least one pte/page referenced.
+ *
+ * Returns 1 if a collapse was attempted via collapse_huge_page(), in
+ * which case mmap_sem has been released on return; returns 0 otherwise
+ * with mmap_sem still held as on entry.
+ */
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+			       struct vm_area_struct *vma,
+			       unsigned long address,
+			       struct page **hpage)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, *_pte;
+	int ret = 0, referenced = 0, none = 0;
+	struct page *page;
+	unsigned long _address;
+	spinlock_t *ptl;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+		goto out;
+
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+	     _pte++, _address += PAGE_SIZE) {
+		pte_t pteval = *_pte;
+		if (pte_none(pteval)) {
+			if (++none <= khugepaged_max_ptes_none)
+				continue;
+			else
+				goto out_unmap;
+		}
+		if (!pte_present(pteval) || !pte_write(pteval))
+			goto out_unmap;
+		page = vm_normal_page(vma, _address, pteval);
+		if (unlikely(!page))
+			goto out_unmap;
+		VM_BUG_ON(PageCompound(page));
+		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+			goto out_unmap;
+		/* cannot use mapcount: can't collapse if there's a gup pin */
+		if (page_count(page) != 1)
+			goto out_unmap;
+		/*
+		 * Query the secondary MMUs for the page being examined
+		 * (_address), not for the first page of the range on
+		 * every iteration.
+		 */
+		if (pte_young(pteval) || PageReferenced(page) ||
+		    mmu_notifier_test_young(vma->vm_mm, _address))
+			referenced = 1;
+	}
+	if (referenced)
+		ret = 1;
+out_unmap:
+	pte_unmap_unlock(pte, ptl);
+	if (ret)
+		/* collapse_huge_page will return with the mmap_sem released */
+		collapse_huge_page(mm, address, hpage, vma);
+out:
+	return ret;
+}
+
+/*
+ * Free mm_slot if its mm is exiting; otherwise do nothing.
+ * Must be called with khugepaged_mm_lock held.
+ */
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+	struct mm_struct *mm = mm_slot->mm;
+
+	VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+	if (!khugepaged_test_exit(mm))
+		return;
+
+	/* unhash the slot and unlink it from the scan list */
+	hlist_del(&mm_slot->hash);
+	list_del(&mm_slot->mm_node);
+
+	/*
+	 * MMF_VM_HUGEPAGE is deliberately left set in mm->flags: clearing
+	 * it is not strictly needed because the mm exited already.
+	 */
+
+	/* khugepaged_mm_lock is not actually required from here on */
+	free_mm_slot(mm_slot);
+	mmdrop(mm);
+}
+
+/*
+ * Scan part of one mm for collapsible pmds, resuming at the cursor kept
+ * in khugepaged_scan (mm_slot + address).
+ *
+ * Entered with khugepaged_mm_lock held; the lock is dropped while the mm
+ * is scanned and is held again on return.
+ *
+ * pages is the scan budget (must be non-zero).  The return value is the
+ * progress made: one unit per entry into this function and per skipped
+ * vma, HPAGE_PMD_NR per pmd handed to khugepaged_scan_pmd().
+ */
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+ struct page **hpage)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+ /* resume the current slot, or start over from the head of the list */
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else {
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+
+ /* skip vmas that are not eligible for hugepages */
+ if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+ !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE)) {
+ progress++;
+ continue;
+ }
+
+ /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+ if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
+ khugepaged_scan.address = vma->vm_end;
+ progress++;
+ continue;
+ }
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+ /* hugepage-aligned sub-range of the vma: start rounded up, end rounded down */
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart >= hend) {
+ progress++;
+ continue;
+ }
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ if (khugepaged_scan.address > hend) {
+ khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
+ progress++;
+ continue;
+ }
+ BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_sem so break loop */
+ goto breakouterloop_mmap_sem;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+ spin_lock(&khugepaged_mm_lock);
+ BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ /* vma is only compared against NULL here, never dereferenced */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else {
+ /* end of the list reached: one full pass completed */
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+/* True when at least one mm is registered and khugepaged is enabled. */
+static int khugepaged_has_work(void)
+{
+	return !(list_empty(&khugepaged_scan.mm_head) ||
+		 !khugepaged_enabled());
+}
+
+/*
+ * Wake-up condition for an idle khugepaged: either an mm has been
+ * registered or khugepaged has been disabled.
+ */
+static int khugepaged_wait_event(void)
+{
+	return !(list_empty(&khugepaged_scan.mm_head) &&
+		 khugepaged_enabled());
+}
+
+/*
+ * Run one scan batch: call khugepaged_scan_mm_slot() repeatedly until
+ * khugepaged_pages_to_scan units of progress have been made.
+ *
+ * The batch stops early when a huge page cannot be obtained (!NUMA:
+ * allocation fails here; NUMA: *hpage holds an error pointer), when the
+ * thread should stop or freeze, when there is no work, or when the head
+ * of the mm list has been passed twice within this batch.
+ */
+static void khugepaged_do_scan(struct page **hpage)
+{
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ while (progress < pages) {
+ cond_resched();
+
+#ifndef CONFIG_NUMA
+ /* a previous collapse consumed the page: allocate a fresh one */
+ if (!*hpage) {
+ *hpage = alloc_hugepage(khugepaged_defrag());
+ if (unlikely(!*hpage))
+ break;
+ }
+#else
+ if (IS_ERR(*hpage))
+ break;
+#endif
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ /* no current slot means the next scan starts from the list head */
+ if (!khugepaged_scan.mm_slot)
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2)
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ hpage);
+ else
+ progress = pages;
+ spin_unlock(&khugepaged_mm_lock);
+ }
+}
+
+/*
+ * Back off after a failed huge page allocation: sleep interruptibly on
+ * khugepaged_wait for up to khugepaged_alloc_sleep_millisecs.
+ */
+static void khugepaged_alloc_sleep(void)
+{
+	unsigned long timeout;
+	DEFINE_WAIT(wait);
+
+	add_wait_queue(&khugepaged_wait, &wait);
+	timeout = msecs_to_jiffies(khugepaged_alloc_sleep_millisecs);
+	schedule_timeout_interruptible(timeout);
+	remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
+/*
+ * Allocate a huge page, sleeping and retrying after each failure for as
+ * long as khugepaged stays enabled.  Returns NULL only if khugepaged was
+ * disabled after a failed attempt.
+ */
+static struct page *khugepaged_alloc_hugepage(void)
+{
+	struct page *hpage;
+
+	for (;;) {
+		hpage = alloc_hugepage(khugepaged_defrag());
+		if (likely(hpage))
+			break;
+		khugepaged_alloc_sleep();
+		if (unlikely(!khugepaged_enabled()))
+			break;
+	}
+	return hpage;
+}
+#endif
+
+/*
+ * Main work loop of the khugepaged thread: run scan batches while
+ * khugepaged is enabled, sleeping between batches.
+ *
+ * !CONFIG_NUMA: a huge page is preallocated before each batch and the
+ * leftover, if any, is released afterwards.
+ * CONFIG_NUMA: the page is allocated by the collapse path; an error
+ * pointer left in hpage triggers an allocation back-off sleep.
+ */
+static void khugepaged_loop(void)
+{
+ struct page *hpage;
+
+#ifdef CONFIG_NUMA
+ hpage = NULL;
+#endif
+ while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
+ hpage = khugepaged_alloc_hugepage();
+ if (unlikely(!hpage))
+ break;
+#else
+ if (IS_ERR(hpage)) {
+ khugepaged_alloc_sleep();
+ hpage = NULL;
+ }
+#endif
+
+ khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
+ /* still non-NULL means the batch did not consume the page */
+ if (hpage)
+ put_page(hpage);
+#endif
+ try_to_freeze();
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (khugepaged_has_work()) {
+ DEFINE_WAIT(wait);
+ /* a zero scan sleep means rescan immediately */
+ if (!khugepaged_scan_sleep_millisecs)
+ continue;
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_scan_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+ } else if (khugepaged_enabled())
+ /* nothing to scan: sleep until khugepaged_wait_event() holds */
+ wait_event_freezable(khugepaged_wait,
+ khugepaged_wait_event());
+ }
+}
+
+/*
+ * Thread function of khugepaged.  Runs khugepaged_loop() until
+ * khugepaged is disabled or the thread is asked to stop, then drops the
+ * scan cursor's mm_slot and clears khugepaged_thread.  The argument is
+ * unused.  Always returns 0.
+ */
+static int khugepaged(void *none)
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, 19);
+
+ /* serialize with start_khugepaged() */
+ mutex_lock(&khugepaged_mutex);
+
+ for (;;) {
+ /* the mutex is not held while the loop runs */
+ mutex_unlock(&khugepaged_mutex);
+ BUG_ON(khugepaged_thread != current);
+ khugepaged_loop();
+ BUG_ON(khugepaged_thread != current);
+
+ /* re-check the exit conditions under the mutex */
+ mutex_lock(&khugepaged_mutex);
+ if (!khugepaged_enabled())
+ break;
+ if (unlikely(kthread_should_stop()))
+ break;
+ }
+
+ /* reset the scan cursor; the slot is freed only if its mm is exiting */
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot;
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+
+ khugepaged_thread = NULL;
+ mutex_unlock(&khugepaged_mutex);
+
+ return 0;
+}
+
+/*
+ * Split the huge page mapped by pmd, if pmd is (still) a transparent
+ * huge pmd.  Returns without doing anything when it is not.
+ */
+void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct page *page;
+
+ /* check the pmd under page_table_lock */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ return;
+ }
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!page_count(page));
+ /* hold a reference across the split, which runs without page_table_lock */
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ split_huge_page(page);
+
+ put_page(page);
+ /* after the split the pmd must no longer be huge */
+ BUG_ON(pmd_trans_huge(*pmd));
+}
+
+/*
+ * Look up the pmd that maps address in mm and split it if it is a huge
+ * pmd.  Returns silently when any page table level is not present.
+ */
+static void split_huge_page_address(struct mm_struct *mm,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ /* asserts that address is NOT hugepage aligned */
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd(mm, pmd);
+}
+
+/*
+ * Split the huge pmd covering addr when addr is not hugepage aligned and
+ * the hugepage-sized, hugepage-aligned range around it lies entirely
+ * inside vma, i.e. when that range could previously contain a hugepage.
+ */
+static void split_huge_page_if_unaligned(struct vm_area_struct *vma,
+					  unsigned long addr)
+{
+	unsigned long haddr = addr & HPAGE_PMD_MASK;
+
+	if (!(addr & ~HPAGE_PMD_MASK))
+		return;
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return;
+	split_huge_page_address(vma->vm_mm, addr);
+}
+
+/*
+ * Called when the boundaries of vma are about to change to [start, end):
+ * split any huge pmd that a new, non hugepage aligned boundary would cut
+ * through.  When adjust_next is positive the start of vma->vm_next moves
+ * forward by adjust_next pages as well and gets the same treatment.
+ */
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+			     unsigned long start,
+			     unsigned long end,
+			     long adjust_next)
+{
+	/* new start of this vma */
+	split_huge_page_if_unaligned(vma, start);
+
+	/* new end of this vma */
+	split_huge_page_if_unaligned(vma, end);
+
+	/* new start of the next vma, if it is being moved too */
+	if (adjust_next > 0) {
+		struct vm_area_struct *next = vma->vm_next;
+		unsigned long nstart = next->vm_start;
+
+		nstart += adjust_next << PAGE_SHIFT;
+		split_huge_page_if_unaligned(next, nstart);
+	}
+}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e91b81b63670..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
* Generic hugetlb support.
* (C) William Irwin, April 2004
*/
-#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -18,6 +17,10 @@
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}
+/*
+ * Non-static wrapper around vma_hugecache_offset(): returns the offset of
+ * address within vma, computed with the hstate of that vma.
+ */
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* cases this will be same size as used by the page table entries.
@@ -385,67 +394,37 @@ static int vma_has_reserves(struct vm_area_struct *vma)
return 0;
}
-static void clear_gigantic_page(struct page *page,
- unsigned long addr, unsigned long sz)
-{
- int i;
- struct page *p = page;
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
- cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
- }
-}
-static void clear_huge_page(struct page *page,
- unsigned long addr, unsigned long sz)
-{
- int i;
-
- if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, sz);
- return;
- }
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++) {
- cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
- }
-}
-
-static void copy_gigantic_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
+static void copy_gigantic_page(struct page *dst, struct page *src)
{
int i;
- struct hstate *h = hstate_vma(vma);
+ struct hstate *h = page_hstate(src);
struct page *dst_base = dst;
struct page *src_base = src;
- might_sleep();
+
for (i = 0; i < pages_per_huge_page(h); ) {
cond_resched();
- copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+ copy_highpage(dst, src);
i++;
dst = mem_map_next(dst, dst_base, i);
src = mem_map_next(src, src_base, i);
}
}
-static void copy_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
+
+void copy_huge_page(struct page *dst, struct page *src)
{
int i;
- struct hstate *h = hstate_vma(vma);
+ struct hstate *h = page_hstate(src);
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
- copy_gigantic_page(dst, src, addr, vma);
+ copy_gigantic_page(dst, src);
return;
}
might_sleep();
for (i = 0; i < pages_per_huge_page(h); i++) {
cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+ copy_highpage(dst + i, src + i);
}
}
@@ -457,19 +436,34 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
+/*
+ * Take the first huge page off the free list of node nid.
+ * Returns NULL if that list is empty.  The returned page has been given a
+ * reference via set_page_refcounted() and the global and per-node free
+ * counters have been decremented.
+ * NOTE(review): presumably callers must hold hugetlb_lock - confirm.
+ */
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ if (list_empty(&h->hugepage_freelists[nid]))
+ return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+ list_del(&page->lru);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
- int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
- struct zonelist *zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask, &mpol, &nodemask);
+ struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ get_mems_allowed();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask, &mpol, &nodemask);
/*
* A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are
@@ -477,30 +471,26 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
*/
if (!vma_has_reserves(vma) &&
h->free_huge_pages - h->resv_huge_pages == 0)
- return NULL;
+ goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
- return NULL;
+ goto err;;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- nid = zone_to_nid(zone);
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
- !list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
-
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
-
- break;
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+ page = dequeue_huge_page_node(h, zone_to_nid(zone));
+ if (page) {
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
+ break;
+ }
}
}
+err:
mpol_cond_put(mpol);
+ put_mems_allowed();
return page;
}
@@ -546,7 +536,9 @@ static void free_huge_page(struct page *page)
mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
+ page->mapping = NULL;
BUG_ON(page_count(page));
+ BUG_ON(page_mapcount(page));
INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
@@ -600,6 +592,8 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
+EXPORT_SYMBOL_GPL(PageHuge);
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -753,11 +747,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
struct page *page;
- unsigned int nid;
+ unsigned int r_nid;
if (h->order >= MAX_ORDER)
return NULL;
@@ -795,9 +788,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
- __GFP_REPEAT|__GFP_NOWARN,
- huge_page_order(h));
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
+ else
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
@@ -806,19 +804,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
spin_lock(&hugetlb_lock);
if (page) {
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- put_page_testzero(page);
- VM_BUG_ON(page_count(page));
- nid = page_to_nid(page);
+ r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
/*
* We incremented the global counters already
*/
- h->nr_huge_pages_node[nid]++;
- h->surplus_huge_pages_node[nid]++;
+ h->nr_huge_pages_node[r_nid]++;
+ h->surplus_huge_pages_node[r_nid]++;
__count_vm_event(HTLB_BUDDY_PGALLOC);
} else {
h->nr_huge_pages--;
@@ -831,6 +823,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares about the
+ * physical address of the error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ /* first try the free list of node nid, under hugetlb_lock */
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page_node(h, nid);
+ spin_unlock(&hugetlb_lock);
+
+ /* free list empty: fall back to alloc_buddy_huge_page() for that node */
+ if (!page)
+ page = alloc_buddy_huge_page(h, nid);
+
+ /* may still be NULL if the fallback failed too */
+ return page;
+}
+
+/*
* Increase the hugetlb pool such that it can accomodate a reservation
* of size 'delta'.
*/
@@ -854,17 +865,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NULL, 0);
- if (!page) {
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page)
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
- spin_lock(&hugetlb_lock);
- needed = 0;
goto free;
- }
list_add(&page->lru, &surplus_list);
}
@@ -891,31 +899,31 @@ retry:
needed += allocated;
h->resv_huge_pages += delta;
ret = 0;
-free:
+
+ spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
list_del(&page->lru);
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
/* Free unnecessary surplus pages to the buddy allocator */
+free:
if (!list_empty(&surplus_list)) {
- spin_unlock(&hugetlb_lock);
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru);
- /*
- * The page has a reference count of zero already, so
- * call free_huge_page directly instead of using
- * put_page. This must be done with hugetlb_lock
- * unlocked which is safe because free_huge_page takes
- * hugetlb_lock before deciding how to free the page.
- */
- free_huge_page(page);
+ put_page(page);
}
- spin_lock(&hugetlb_lock);
}
+ spin_lock(&hugetlb_lock);
return ret;
}
@@ -1035,14 +1043,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
if (!page) {
- page = alloc_buddy_huge_page(h, vma, addr);
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
hugetlb_put_quota(inode->i_mapping, chg);
- return ERR_PTR(-VM_FAULT_OOM);
+ return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
- set_page_refcounted(page);
set_page_private(page, (unsigned long) mapping);
vma_commit_reservation(h, vma, addr);
@@ -1356,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
return sprintf(buf, "%lu\n", nr_huge_pages);
}
+
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
@@ -1368,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
err = strict_strtoul(buf, 10, &count);
if (err)
- return 0;
+ goto out;
h = kobj_to_hstate(kobj, &nid);
+ if (h->order >= MAX_ORDER) {
+ err = -EINVAL;
+ goto out;
+ }
+
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
@@ -1396,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
NODEMASK_FREE(nodes_allowed);
return len;
+out:
+ NODEMASK_FREE(nodes_allowed);
+ return err;
}
static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1438,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
+
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
@@ -1445,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
+ if (h->order >= MAX_ORDER)
+ return -EINVAL;
+
err = strict_strtoul(buf, 10, &input);
if (err)
- return 0;
+ return err;
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
@@ -1515,10 +1535,9 @@ static struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
-static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
- struct kobject *parent,
- struct kobject **hstate_kobjs,
- struct attribute_group *hstate_attr_group)
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
+ struct kobject **hstate_kobjs,
+ struct attribute_group *hstate_attr_group)
{
int retval;
int hi = h - hstates;
@@ -1851,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
if (!write)
tmp = h->max_huge_pages;
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1872,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
-
- return 0;
+out:
+ return ret;
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1911,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
if (!write)
tmp = h->nr_overcommit_huge_pages;
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
spin_unlock(&hugetlb_lock);
}
-
- return 0;
+out:
+ return ret;
}
#endif /* CONFIG_SYSCTL */
@@ -2088,7 +2119,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, ptep);
}
}
@@ -2125,6 +2156,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
+ page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -2136,6 +2168,32 @@ nomem:
return -ENOMEM;
}
+/*
+ * Returns 1 if pte is a non-present, non-empty huge pte whose swap entry
+ * is a migration entry, 0 otherwise.
+ */
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+
+	swp = pte_to_swp_entry(pte);
+	return non_swap_entry(swp) && is_migration_entry(swp);
+}
+
+/*
+ * Returns 1 if pte is a non-present, non-empty huge pte whose swap entry
+ * is a hwpoison entry, 0 otherwise.
+ */
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+
+	swp = pte_to_swp_entry(pte);
+	return non_swap_entry(swp) && is_hwpoison_entry(swp);
+}
+
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
@@ -2194,6 +2252,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (huge_pte_none(pte))
continue;
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ continue;
+
page = pte_page(pte);
if (pte_dirty(pte))
set_page_dirty(page);
@@ -2203,6 +2267,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end);
mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ page_remove_rmap(page);
list_del(&page->lru);
put_page(page);
}
@@ -2268,6 +2333,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
return 1;
}
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
struct page *pagecache_page)
@@ -2282,8 +2350,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_count(old_page) == 1);
+ avoidcopy = (page_mapcount(old_page) == 1);
if (avoidcopy) {
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2334,7 +2404,18 @@ retry_avoidcopy:
return -PTR_ERR(new_page);
}
- copy_huge_page(new_page, old_page, address, vma);
+ /*
+ * When the original hugepage is shared one, it does not have
+ * anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma))) {
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
+ return VM_FAULT_OOM;
+ }
+
+ copy_user_huge_page(new_page, old_page, address, vma,
+ pages_per_huge_page(h));
__SetPageUptodate(new_page);
/*
@@ -2345,11 +2426,19 @@ retry_avoidcopy:
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
+ mmu_notifier_invalidate_range_start(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page);
+ hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
+ mmu_notifier_invalidate_range_end(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
}
page_cache_release(new_page);
page_cache_release(old_page);
@@ -2430,7 +2519,7 @@ retry:
ret = -PTR_ERR(page);
goto out;
}
- clear_huge_page(page, address, huge_page_size(h));
+ clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
if (vma->vm_flags & VM_MAYSHARE) {
@@ -2448,8 +2537,27 @@ retry:
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
- } else
+ page_dup_rmap(page);
+ } else {
lock_page(page);
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ hugepage_add_new_anon_rmap(page, vma, address);
+ }
+ } else {
+ /*
+ * If a memory error occurs between mmap() and fault, some processes
+ * don't have a hwpoisoned swap entry for the errored virtual address.
+ * So we need to block the hugepage fault by a PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(h - hstates);
+ goto backout_unlocked;
+ }
+ page_dup_rmap(page);
}
/*
@@ -2501,10 +2609,22 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ struct page *page = NULL;
struct page *pagecache_page = NULL;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ ptep = huge_pte_offset(mm, address);
+ if (ptep) {
+ entry = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait(mm, (pmd_t *)ptep, address);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(h - hstates);
+ }
+
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
@@ -2542,6 +2662,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
+ /*
+ * hugetlb_cow() requires page locks of pte_page(entry) and
+ * pagecache_page, so here we need to take the former one
+ * when page != pagecache_page or !pagecache_page.
+ * Note that locking order is always pagecache_page -> page,
+ * so no worry about deadlock.
+ */
+ page = pte_page(entry);
+ if (page != pagecache_page)
+ lock_page(page);
+
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2559,7 +2690,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, address, ptep, entry,
flags & FAULT_FLAG_WRITE))
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, ptep);
out_page_table_lock:
spin_unlock(&mm->page_table_lock);
@@ -2568,6 +2699,8 @@ out_page_table_lock:
unlock_page(pagecache_page);
put_page(pagecache_page);
}
+ if (page != pagecache_page)
+ unlock_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2779,3 +2912,42 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_put_quota(inode->i_mapping, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}
+
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Look for hpage on its node's free list; must be called in hugetlb_lock. */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+	struct page *cur;
+
+	list_for_each_entry(cur, &h->hugepage_freelists[nid], lru) {
+		if (cur == hpage)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Called by the memory failure code with the head page lock assumed held.
+ * Takes hpage off its free list: 0 on success, -EBUSY if it is not on one.
+ */
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
+{
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+	int on_freelist;
+
+	spin_lock(&hugetlb_lock);
+	on_freelist = is_hugepage_on_freelist(hpage);
+	if (on_freelist) {
+		list_del(&hpage->lru);
+		set_page_refcounted(hpage);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+	}
+	spin_unlock(&hugetlb_lock);
+	return on_freelist ? 0 : -EBUSY;
+}
+#endif
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1f..0948f1072d6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
#include "internal.h"
static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
{
unsigned long pfn = val;
struct page *p;
+ struct page *hpage;
int err;
if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
return -ENXIO;
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
/*
* This implies unable to support free buddy pages.
*/
- if (!get_page_unless_zero(p))
+ if (!get_page_unless_zero(hpage))
return 0;
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
/*
* This implies unable to support non-LRU pages.
*/
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
return 0;
/*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
* We temporarily take page lock for try_get_mem_cgroup_from_page().
* __memory_failure() will redo the check reliably inside page lock.
*/
- lock_page(p);
- err = hwpoison_filter(p);
- unlock_page(p);
+ lock_page(hpage);
+ err = hwpoison_filter(hpage);
+ unlock_page(hpage);
if (err)
return 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da9668..1d29cdfe8ebb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
#include <asm/atomic.h>
#include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)	/* fallback: expands to nothing */
+#endif
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
+ INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..69488205723d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
*/
static inline unsigned long page_order(struct page *page)
{
- VM_BUG_ON(!PageBuddy(page));
+ /* PageBuddy() must be checked by the caller */
return page_private(page);
}
@@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
}
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+ struct vm_area_struct *vma);
+#endif
#else /* !CONFIG_MMU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
@@ -243,7 +247,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas);
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking);
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5b069e4f5e48..bd9bc214091b 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -72,7 +72,6 @@
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/prio_tree.h>
-#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -212,6 +211,9 @@ static signed long jiffies_scan_wait;
static int kmemleak_stack_scan = 1;
/* protects the memory scanning, parameters and debug/kmemleak file access */
static DEFINE_MUTEX(scan_mutex);
+/* set by kmemleak=on so that kmemleak_init() skips the default-off disable */
+static int kmemleak_skip_disable;
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -399,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
object = prio_tree_entry(node, struct kmemleak_object,
tree_node);
if (!alias && object->pointer != ptr) {
- kmemleak_warn("Found object by alias");
+ pr_warning("Found object by alias at 0x%08lx\n", ptr);
+ dump_stack();
+ dump_object_info(object);
object = NULL;
}
} else
@@ -696,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color)
}
/*
- * Make a object permanently as gray-colored so that it can no longer be
+ * Mark an object permanently as gray-colored so that it can no longer be
* reported as a leak. This is used in general to mark a false positive.
*/
static void make_gray_object(unsigned long ptr)
@@ -839,10 +843,19 @@ out:
rcu_read_unlock();
}
-/*
- * Memory allocation function callback. This function is called from the
- * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
- * vmalloc etc.).
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ * scanning a number of references less than @min_count is found,
+ * the object is reported as a memory leak. If @min_count is 0,
+ * the object is never reported as a leak. If @min_count is -1,
+ * the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
*/
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
@@ -856,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
-/*
- * Memory freeing function callback. This function is called from the kernel
- * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
*/
void __ref kmemleak_free(const void *ptr)
{
@@ -871,9 +887,14 @@ void __ref kmemleak_free(const void *ptr)
}
EXPORT_SYMBOL_GPL(kmemleak_free);
-/*
- * Partial memory freeing function callback. This function is usually called
- * from bootmem allocator when (part of) a memory block is freed.
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ * represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
*/
void __ref kmemleak_free_part(const void *ptr, size_t size)
{
@@ -886,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
-/*
- * Mark an already allocated memory block as a false positive. This will cause
- * the block to no longer be reported as leak and always be scanned.
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
*/
void __ref kmemleak_not_leak(const void *ptr)
{
@@ -901,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_not_leak);
-/*
- * Ignore a memory block. This is usually done when it is known that the
- * corresponding block is not a leak and does not contain any references to
- * other allocated memory blocks.
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
*/
void __ref kmemleak_ignore(const void *ptr)
{
@@ -917,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_ignore);
-/*
- * Limit the range to be scanned in an allocated memory block.
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ * represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number of false negatives.
*/
void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
@@ -931,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(kmemleak_scan_area);
-/*
- * Inform kmemleak not to scan the given memory block.
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
*/
void __ref kmemleak_no_scan(const void *ptr)
{
@@ -1603,7 +1645,9 @@ static int kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
- else if (strcmp(str, "on") != 0)
+ else if (strcmp(str, "on") == 0)
+ kmemleak_skip_disable = 1;
+ else
return -EINVAL;
return 0;
}
@@ -1617,6 +1661,13 @@ void __init kmemleak_init(void)
int i;
unsigned long flags;
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+ if (!kmemleak_skip_disable) {
+ kmemleak_disable();
+ return;
+ }
+#endif
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
diff --git a/mm/ksm.c b/mm/ksm.c
index 56a0da1f9979..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,8 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
+#include <linux/hash.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -153,8 +155,9 @@ struct rmap_item {
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;
-#define MM_SLOTS_HASH_HEADS 1024
-static struct hlist_head *mm_slots_hash;
+#define MM_SLOTS_HASH_SHIFT 10
+#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
+static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +272,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
kmem_cache_free(mm_slot_cache, mm_slot);
}
-static int __init mm_slots_hash_init(void)
-{
- mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!mm_slots_hash)
- return -ENOMEM;
- return 0;
-}
-
-static void __init mm_slots_hash_free(void)
-{
- kfree(mm_slots_hash);
-}
-
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
struct hlist_head *bucket;
struct hlist_node *node;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
hlist_for_each_entry(mm_slot, node, bucket, link) {
if (mm == mm_slot->mm)
return mm_slot;
@@ -303,8 +291,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
{
struct hlist_head *bucket;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
mm_slot->mm = mm;
hlist_add_head(&mm_slot->link, bucket);
}
@@ -318,19 +305,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
struct anon_vma *anon_vma)
{
rmap_item->anon_vma = anon_vma;
- atomic_inc(&anon_vma->ksm_refcount);
+ get_anon_vma(anon_vma);
}
-static void drop_anon_vma(struct rmap_item *rmap_item)
+static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
{
struct anon_vma *anon_vma = rmap_item->anon_vma;
- if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
+ drop_anon_vma(anon_vma);
}
/*
@@ -365,7 +347,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
page = follow_page(vma, addr, FOLL_GET);
- if (!page)
+ if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
@@ -415,7 +397,7 @@ static void break_cow(struct rmap_item *rmap_item)
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
@@ -430,6 +412,20 @@ out:
up_read(&mm->mmap_sem);
}
+/* Return the head of an anonymous transparent compound page, else NULL. */
+static struct page *page_trans_compound_anon(struct page *page)
+{
+	struct page *head;
+
+	if (!PageTransCompound(page))
+		return NULL;
+	/*
+	 * The head may be split and freed from under us, but that is ok here.
+	 */
+	head = compound_trans_head(page);
+	return PageAnon(head) ? head : NULL;
+}
+
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -447,9 +443,9 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
goto out;
page = follow_page(vma, addr, FOLL_GET);
- if (!page)
+ if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page)) {
+ if (PageAnon(page) || page_trans_compound_anon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -470,7 +466,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
ksm_pages_sharing--;
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
@@ -558,7 +554,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -727,11 +723,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (addr == -EFAULT)
goto out;
+ BUG_ON(PageTransCompound(page));
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
goto out;
- if (pte_write(*ptep)) {
+ if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
swapped = PageSwapCache(page);
@@ -751,10 +748,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* page
*/
if (page_mapcount(page) + 1 + swapped != page_count(page)) {
- set_pte_at_notify(mm, addr, ptep, entry);
+ set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
- entry = pte_wrprotect(entry);
+ if (pte_dirty(entry))
+ set_page_dirty(page);
+ entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
@@ -800,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
goto out;
pmd = pmd_offset(pud, addr);
+ BUG_ON(pmd_trans_huge(*pmd));
if (!pmd_present(*pmd))
goto out;
@@ -817,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
@@ -825,6 +827,33 @@ out:
return err;
}
+/*
+ * Split the transparent hugepage that @page belongs to, if its head is
+ * anonymous.  Returns 0 if there was nothing to split, split_huge_page()'s
+ * result if a split was attempted, or 1 if the caller should retry later.
+ */
+static int page_trans_compound_anon_split(struct page *page)
+{
+	struct page *head = page_trans_compound_anon(page);
+	int ret = 1;	/* "retry later" unless the split is attempted */
+
+	if (!head)
+		return 0;
+
+	/* Get the reference on the head to split it. */
+	if (!get_page_unless_zero(head))
+		return ret;	/* split_huge_page ran from under us */
+
+	/*
+	 * Recheck that we got the reference while the head was still
+	 * anonymous; if not, split_huge_page ran from under us.
+	 */
+	if (PageAnon(head))
+		ret = split_huge_page(head);
+	put_page(head);
+	return ret;
+}
+
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -845,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+ goto out;
+ BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -1086,7 +1118,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
- if (!tree_page)
+ if (IS_ERR_OR_NULL(tree_page))
return NULL;
/*
@@ -1264,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
root_unstable_tree = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
@@ -1294,7 +1338,13 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (*page && PageAnon(*page)) {
+ if (IS_ERR_OR_NULL(*page)) {
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ continue;
+ }
+ if (PageAnon(*page) ||
+ page_trans_compound_anon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1308,8 +1358,7 @@ next_mm:
up_read(&mm->mmap_sem);
return rmap_item;
}
- if (*page)
- put_page(*page);
+ put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
@@ -1367,9 +1416,9 @@ next_mm:
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
- struct page *page;
+ struct page *uninitialized_var(page);
- while (scan_npages--) {
+ while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
@@ -1387,6 +1436,7 @@ static int ksmd_should_run(void)
static int ksm_scan_thread(void *nothing)
{
+ set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
@@ -1395,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
+ try_to_freeze();
+
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
- wait_event_interruptible(ksm_thread_wait,
+ wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
@@ -1523,8 +1575,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
{
struct page *new_page;
- unlock_page(page); /* any racers will COW it, not modify it */
-
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
@@ -1540,7 +1590,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
add_page_to_unevictable_list(new_page);
}
- page_cache_release(page);
return new_page;
}
@@ -1563,10 +1612,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ anon_vma_lock(anon_vma);
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
@@ -1587,7 +1638,7 @@ again:
if (!search_new_forks || !mapcount)
break;
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
if (!mapcount)
goto out;
}
@@ -1614,10 +1665,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ anon_vma_lock(anon_vma);
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
@@ -1633,11 +1686,11 @@ again:
ret = try_to_unmap_one(page, vma,
rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1664,10 +1717,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ anon_vma_lock(anon_vma);
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
@@ -1682,11 +1737,11 @@ again:
ret = rmap_one(page, vma, rmap_item->address, arg);
if (ret != SWAP_AGAIN) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1738,8 +1793,13 @@ static int ksm_memory_callback(struct notifier_block *self,
/*
* Keep it very simple for now: just lock out ksmd and
* MADV_UNMERGEABLE while any memory is going offline.
+ * mutex_lock_nested() is necessary because lockdep was alarmed
+ * that here we take ksm_thread_mutex inside notifier chain
+ * mutex, and later take notifier chain mutex inside
+ * ksm_thread_mutex to unlock it. But that's safe because both
+ * are inside mem_hotplug_mutex.
*/
- mutex_lock(&ksm_thread_mutex);
+ mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
break;
case MEM_OFFLINE:
@@ -1937,15 +1997,11 @@ static int __init ksm_init(void)
if (err)
goto out;
- err = mm_slots_hash_init();
- if (err)
- goto out_free1;
-
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
printk(KERN_ERR "ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#ifdef CONFIG_SYSFS
@@ -1953,7 +2009,7 @@ static int __init ksm_init(void)
if (err) {
printk(KERN_ERR "ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#else
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1969,9 +2025,7 @@ static int __init ksm_init(void)
#endif
return 0;
-out_free2:
- mm_slots_hash_free();
-out_free1:
+out_free:
ksm_slab_free();
out:
return err;
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..e2b6f5634e0d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
/*
* Access kernel memory without faulting.
*/
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/uaccess.h>
/**
* probe_kernel_read(): safely attempt to read from a location
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
if (error)
goto out;
break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
}
if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+#endif
return 1;
default:
diff --git a/mm/memblock.c b/mm/memblock.c
new file mode 100644
index 000000000000..400dc62697d7
--- /dev/null
+++ b/mm/memblock.c
@@ -0,0 +1,842 @@
+/*
+ * Procedures for maintaining information about logical memory blocks.
+ *
+ * Peter Bergner, IBM Corp. June 2001.
+ * Copyright (C) 2001 Peter Bergner.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/pfn.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/memblock.h>
+
+struct memblock memblock __initdata_memblock;
+
+int memblock_debug __initdata_memblock;
+int memblock_can_resize __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
+
+/* inline so we don't get a warning when pr_debug is compiled out */
+static inline const char *memblock_type_name(struct memblock_type *type)
+{
+ if (type == &memblock.memory)
+ return "memory";
+ else if (type == &memblock.reserved)
+ return "reserved";
+ else
+ return "unknown";
+}
+
+/*
+ * Address comparison utilities
+ */
+
+static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
+{
+ return addr & ~(size - 1);
+}
+
+static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
+{
+ return (addr + (size - 1)) & ~(size - 1);
+}
+
+static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ if (base2 == base1 + size1)
+ return 1;
+ else if (base1 == base2 + size2)
+ return -1;
+
+ return 0;
+}
+
+static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
+ unsigned long r1, unsigned long r2)
+{
+ phys_addr_t base1 = type->regions[r1].base;
+ phys_addr_t size1 = type->regions[r1].size;
+ phys_addr_t base2 = type->regions[r2].base;
+ phys_addr_t size2 = type->regions[r2].size;
+
+ return memblock_addrs_adjacent(base1, size1, base2, size2);
+}
+
+long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+{
+ unsigned long i;
+
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
+ if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ break;
+ }
+
+ return (i < type->cnt) ? i : -1;
+}
+
+/*
+ * Find, allocate, deallocate or reserve unreserved regions. All allocations
+ * are top-down.
+ */
+
+static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align)
+{
+ phys_addr_t base, res_base;
+ long j;
+
+ /* In case, huge size is requested */
+ if (end < size)
+ return MEMBLOCK_ERROR;
+
+ base = memblock_align_down((end - size), align);
+
+ /* Prevent allocations returning 0 as it's also used to
+ * indicate an allocation failure
+ */
+ if (start == 0)
+ start = PAGE_SIZE;
+
+ while (start <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0)
+ return base;
+ res_base = memblock.reserved.regions[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+
+ return MEMBLOCK_ERROR;
+}
+
+static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start, phys_addr_t end)
+{
+ long i;
+
+ BUG_ON(0 == size);
+
+ size = memblock_align_up(size, align);
+
+ /* Pump up max_addr */
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ end = memblock.current_limit;
+
+ /* We do a top-down search, this tends to limit memory
+ * fragmentation by keeping early boot allocs near the
+ * top of memory
+ */
+ for (i = memblock.memory.cnt - 1; i >= 0; i--) {
+ phys_addr_t memblockbase = memblock.memory.regions[i].base;
+ phys_addr_t memblocksize = memblock.memory.regions[i].size;
+ phys_addr_t bottom, top, found;
+
+ if (memblocksize < size)
+ continue;
+ if ((memblockbase + memblocksize) <= start)
+ break;
+ bottom = max(memblockbase, start);
+ top = min(memblockbase + memblocksize, end);
+ if (bottom >= top)
+ continue;
+ found = memblock_find_region(bottom, top, size, align);
+ if (found != MEMBLOCK_ERROR)
+ return found;
+ }
+ return MEMBLOCK_ERROR;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
+{
+ return memblock_find_base(size, align, start, end);
+}
+
+/*
+ * Free memblock.reserved.regions
+ */
+int __init_memblock memblock_free_reserved_regions(void)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ return memblock_free(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
+}
+
+/*
+ * Reserve memblock.reserved.regions
+ */
+int __init_memblock memblock_reserve_reserved_regions(void)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ return memblock_reserve(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
+}
+
+static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
+{
+ unsigned long i;
+
+ for (i = r; i < type->cnt - 1; i++) {
+ type->regions[i].base = type->regions[i + 1].base;
+ type->regions[i].size = type->regions[i + 1].size;
+ }
+ type->cnt--;
+}
+
+/* Assumption: base addr of region 1 < base addr of region 2 */
+static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
+ unsigned long r1, unsigned long r2)
+{
+ type->regions[r1].size += type->regions[r2].size;
+ memblock_remove_region(type, r2);
+}
+
+/* Defined below but needed now */
+static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
+
+static int __init_memblock memblock_double_array(struct memblock_type *type)
+{
+ struct memblock_region *new_array, *old_array;
+ phys_addr_t old_size, new_size, addr;
+ int use_slab = slab_is_available();
+
+ /* We don't allow resizing until we know about the reserved regions
+ * of memory that aren't suitable for allocation
+ */
+ if (!memblock_can_resize)
+ return -1;
+
+ /* Calculate new doubled size */
+ old_size = type->max * sizeof(struct memblock_region);
+ new_size = old_size << 1;
+
+ /* Try to find some space for it.
+ *
+ * WARNING: We assume that either slab_is_available() and we use it or
+ * we use MEMBLOCK for allocations. That means that this is unsafe to use
+ * when bootmem is currently active (unless bootmem itself is implemented
+ * on top of MEMBLOCK which isn't the case yet)
+ *
+ * This should however not be an issue for now, as we currently only
+ * call into MEMBLOCK while it's still active, or much later when slab is
+ * active for memory hotplug operations
+ */
+ if (use_slab) {
+ new_array = kmalloc(new_size, GFP_KERNEL);
+ addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
+ } else
+ addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr == MEMBLOCK_ERROR) {
+ pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
+ memblock_type_name(type), type->max, type->max * 2);
+ return -1;
+ }
+ new_array = __va(addr);
+
+ memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
+ memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
+
+ /* Found space, we now need to move the array over before
+ * we add the reserved region since it may be our reserved
+ * array itself that is full.
+ */
+ memcpy(new_array, type->regions, old_size);
+ memset(new_array + type->max, 0, old_size);
+ old_array = type->regions;
+ type->regions = new_array;
+ type->max <<= 1;
+
+ /* If we use SLAB that's it, we are done */
+ if (use_slab)
+ return 0;
+
+ /* Add the new reserved region now. Should not fail ! */
+ BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
+
+ /* If the array wasn't our static init one, then free it. We only do
+ * that before SLAB is available as later on, we don't know whether
+ * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
+ * anyways
+ */
+ if (old_array != memblock_memory_init_regions &&
+ old_array != memblock_reserved_init_regions)
+ memblock_free(__pa(old_array), old_size);
+
+ return 0;
+}
+
+extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
+ phys_addr_t addr2, phys_addr_t size2)
+{
+ return 1;
+}
+
+static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+{
+ unsigned long coalesced = 0;
+ long adjacent, i;
+
+ if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ return 0;
+ }
+
+ /* First try and coalesce this MEMBLOCK with another. */
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
+
+ if ((rgnbase == base) && (rgnsize == size))
+ /* Already have this region, so we're done */
+ return 0;
+
+ adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
+ /* Check if arch allows coalescing */
+ if (adjacent != 0 && type == &memblock.memory &&
+ !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
+ break;
+ if (adjacent > 0) {
+ type->regions[i].base -= size;
+ type->regions[i].size += size;
+ coalesced++;
+ break;
+ } else if (adjacent < 0) {
+ type->regions[i].size += size;
+ coalesced++;
+ break;
+ }
+ }
+
+ /* If we plugged a hole, we may want to also coalesce with the
+ * next region
+ */
+ if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
+ ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
+ type->regions[i].size,
+ type->regions[i+1].base,
+ type->regions[i+1].size)))) {
+ memblock_coalesce_regions(type, i, i+1);
+ coalesced++;
+ }
+
+ if (coalesced)
+ return coalesced;
+
+ /* If we are out of space, we fail. It's too late to resize the array
+ * but then this shouldn't have happened in the first place.
+ */
+ if (WARN_ON(type->cnt >= type->max))
+ return -1;
+
+ /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
+ for (i = type->cnt - 1; i >= 0; i--) {
+ if (base < type->regions[i].base) {
+ type->regions[i+1].base = type->regions[i].base;
+ type->regions[i+1].size = type->regions[i].size;
+ } else {
+ type->regions[i+1].base = base;
+ type->regions[i+1].size = size;
+ break;
+ }
+ }
+
+ if (base < type->regions[0].base) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ }
+ type->cnt++;
+
+ /* The array is full ? Try to resize it. If that fails, we undo
+ * our allocation and return an error
+ */
+ if (type->cnt == type->max && memblock_double_array(type)) {
+ type->cnt--;
+ return -1;
+ }
+
+ return 0;
+}
+
+long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_add_region(&memblock.memory, base, size);
+
+}
+
+static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t rgnbegin, rgnend;
+ phys_addr_t end = base + size;
+ int i;
+
+ rgnbegin = rgnend = 0; /* supress gcc warnings */
+
+ /* Find the region where (base, size) belongs to */
+ for (i=0; i < type->cnt; i++) {
+ rgnbegin = type->regions[i].base;
+ rgnend = rgnbegin + type->regions[i].size;
+
+ if ((rgnbegin <= base) && (end <= rgnend))
+ break;
+ }
+
+ /* Didn't find the region */
+ if (i == type->cnt)
+ return -1;
+
+ /* Check to see if we are removing entire region */
+ if ((rgnbegin == base) && (rgnend == end)) {
+ memblock_remove_region(type, i);
+ return 0;
+ }
+
+ /* Check to see if region is matching at the front */
+ if (rgnbegin == base) {
+ type->regions[i].base = end;
+ type->regions[i].size -= size;
+ return 0;
+ }
+
+ /* Check to see if the region is matching at the end */
+ if (rgnend == end) {
+ type->regions[i].size -= size;
+ return 0;
+ }
+
+ /*
+ * We need to split the entry - adjust the current one to the
+ * beginning of the hole and add the region after hole.
+ */
+ type->regions[i].size = base - type->regions[i].base;
+ return memblock_add_region(type, end, rgnend - end);
+}
+
+long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
+{
+ return __memblock_remove(&memblock.memory, base, size);
+}
+
+long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+{
+ return __memblock_remove(&memblock.reserved, base, size);
+}
+
+long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+ struct memblock_type *_rgn = &memblock.reserved;
+
+ BUG_ON(0 == size);
+
+ return memblock_add_region(_rgn, base, size);
+}
+
+phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+ phys_addr_t found;
+
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
+ size = memblock_align_up(size, align);
+
+ found = memblock_find_base(size, align, 0, max_addr);
+ if (found != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, found, size) >= 0)
+ return found;
+
+ return 0;
+}
+
+phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+ phys_addr_t alloc;
+
+ alloc = __memblock_alloc_base(size, align, max_addr);
+
+ if (alloc == 0)
+ panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+ (unsigned long long) size, (unsigned long long) max_addr);
+
+ return alloc;
+}
+
+phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
+{
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+}
+
+
+/*
+ * Additional node-local allocators. Search for node memory is bottom up
+ * and walks memblock regions within that node bottom-up as well, but allocation
+ * within a memblock region is top-down. XXX I plan to fix that at some stage
+ *
+ * WARNING: Only available after early_node_map[] has been populated,
+ * on some architectures, that is after all the calls to add_active_range()
+ * have been done to populate it.
+ */
+
+phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
+{
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * This code originates from sparc which really wants us to walk by addresses
+ * and returns the nid. This is not very convenient for early_pfn_map[] users
+ * as the map isn't sorted yet, and it really wants to be walked by nid.
+ *
+ * For now, I implement the inefficient method below which walks the early
+ * map multiple times. Eventually we may want to use an ARCH config option
+ * to implement a completely different method for both case.
+ */
+ unsigned long start_pfn, end_pfn;
+ int i;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
+ if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
+ continue;
+ *nid = i;
+ return min(end, PFN_PHYS(end_pfn));
+ }
+#endif
+ *nid = 0;
+
+ return end;
+}
+
+static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
+ phys_addr_t size,
+ phys_addr_t align, int nid)
+{
+ phys_addr_t start, end;
+
+ start = mp->base;
+ end = start + mp->size;
+
+ start = memblock_align_up(start, align);
+ while (start < end) {
+ phys_addr_t this_end;
+ int this_nid;
+
+ this_end = memblock_nid_range(start, end, &this_nid);
+ if (this_nid == nid) {
+ phys_addr_t ret = memblock_find_region(start, this_end, size, align);
+ if (ret != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, ret, size) >= 0)
+ return ret;
+ }
+ start = this_end;
+ }
+
+ return MEMBLOCK_ERROR;
+}
+
+phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+ struct memblock_type *mem = &memblock.memory;
+ int i;
+
+ BUG_ON(0 == size);
+
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
+ size = memblock_align_up(size, align);
+
+ /* We do a bottom-up search for a region with the right
+ * nid since that's easier considering how memblock_nid_range()
+ * works
+ */
+ for (i = 0; i < mem->cnt; i++) {
+ phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
+ size, align, nid);
+ if (ret != MEMBLOCK_ERROR)
+ return ret;
+ }
+
+ return 0;
+}
+
+phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+ phys_addr_t res = memblock_alloc_nid(size, align, nid);
+
+ if (res)
+ return res;
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+}
+
+
+/*
+ * Remaining API functions
+ */
+
+/* You must call memblock_analyze() before this. */
+phys_addr_t __init memblock_phys_mem_size(void)
+{
+ return memblock.memory_size;
+}
+
+phys_addr_t __init_memblock memblock_end_of_DRAM(void)
+{
+ int idx = memblock.memory.cnt - 1;
+
+ return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
+}
+
+/* You must call memblock_analyze() after this. */
+void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
+{
+ unsigned long i;
+ phys_addr_t limit;
+ struct memblock_region *p;
+
+ if (!memory_limit)
+ return;
+
+ /* Truncate the memblock regions to satisfy the memory limit. */
+ limit = memory_limit;
+ for (i = 0; i < memblock.memory.cnt; i++) {
+ if (limit > memblock.memory.regions[i].size) {
+ limit -= memblock.memory.regions[i].size;
+ continue;
+ }
+
+ memblock.memory.regions[i].size = limit;
+ memblock.memory.cnt = i + 1;
+ break;
+ }
+
+ memory_limit = memblock_end_of_DRAM();
+
+ /* And truncate any reserves above the limit also. */
+ for (i = 0; i < memblock.reserved.cnt; i++) {
+ p = &memblock.reserved.regions[i];
+
+ if (p->base > memory_limit)
+ p->size = 0;
+ else if ((p->base + p->size) > memory_limit)
+ p->size = memory_limit - p->base;
+
+ if (p->size == 0) {
+ memblock_remove_region(&memblock.reserved, i);
+ i--;
+ }
+ }
+}
+
+static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+{
+ unsigned int left = 0, right = type->cnt;
+
+ do {
+ unsigned int mid = (right + left) / 2;
+
+ if (addr < type->regions[mid].base)
+ right = mid;
+ else if (addr >= (type->regions[mid].base +
+ type->regions[mid].size))
+ left = mid + 1;
+ else
+ return mid;
+ } while (left < right);
+ return -1;
+}
+
+int __init memblock_is_reserved(phys_addr_t addr)
+{
+ return memblock_search(&memblock.reserved, addr) != -1;
+}
+
+int __init_memblock memblock_is_memory(phys_addr_t addr)
+{
+ return memblock_search(&memblock.memory, addr) != -1;
+}
+
+/*
+ * Check whether the range [base, base + size) lies entirely within a single
+ * region of memblock.memory. Returns non-zero if it does, 0 otherwise.
+ */
+int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
+{
+ /* Look the range up in the memory regions, not the reserved ones */
+ int idx = memblock_search(&memblock.memory, base);
+
+ if (idx == -1)
+ return 0;
+ return memblock.memory.regions[idx].base <= base &&
+ (memblock.memory.regions[idx].base +
+ memblock.memory.regions[idx].size) >= (base + size);
+}
+
+int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+}
+
+
+void __init_memblock memblock_set_current_limit(phys_addr_t limit)
+{
+ memblock.current_limit = limit;
+}
+
+static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
+{
+ unsigned long long base, size;
+ int i;
+
+ pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+
+ for (i = 0; i < region->cnt; i++) {
+ base = region->regions[i].base;
+ size = region->regions[i].size;
+
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
+ name, i, base, base + size - 1, size);
+ }
+}
+
+void __init_memblock memblock_dump_all(void)
+{
+ if (!memblock_debug)
+ return;
+
+ pr_info("MEMBLOCK configuration:\n");
+ pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
+
+ memblock_dump(&memblock.memory, "memory");
+ memblock_dump(&memblock.reserved, "reserved");
+}
+
+void __init memblock_analyze(void)
+{
+ int i;
+
+ /* Check marker in the unused last array entry */
+ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+ WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+
+ memblock.memory_size = 0;
+
+ for (i = 0; i < memblock.memory.cnt; i++)
+ memblock.memory_size += memblock.memory.regions[i].size;
+
+ /* We allow resizing from there */
+ memblock_can_resize = 1;
+}
+
+void __init memblock_init(void)
+{
+ static int init_done __initdata = 0;
+
+ if (init_done)
+ return;
+ init_done = 1;
+
+ /* Hookup the initial arrays */
+ memblock.memory.regions = memblock_memory_init_regions;
+ memblock.memory.max = INIT_MEMBLOCK_REGIONS;
+ memblock.reserved.regions = memblock_reserved_init_regions;
+ memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
+
+ /* Write a marker in the unused last array entry */
+ memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+ memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+
+ /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
+ * This simplifies the memblock_add() code below...
+ */
+ memblock.memory.regions[0].base = 0;
+ memblock.memory.regions[0].size = 0;
+ memblock.memory.cnt = 1;
+
+ /* Ditto. */
+ memblock.reserved.regions[0].base = 0;
+ memblock.reserved.regions[0].size = 0;
+ memblock.reserved.cnt = 1;
+
+ memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
+}
+
+static int __init early_memblock(char *p)
+{
+ if (p && strstr(p, "debug"))
+ memblock_debug = 1;
+ return 0;
+}
+early_param("memblock", early_memblock);
+
+#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
+
+static int memblock_debug_show(struct seq_file *m, void *private)
+{
+ struct memblock_type *type = m->private;
+ struct memblock_region *reg;
+ int i;
+
+ for (i = 0; i < type->cnt; i++) {
+ reg = &type->regions[i];
+ seq_printf(m, "%4d: ", i);
+ if (sizeof(phys_addr_t) == 4)
+ seq_printf(m, "0x%08lx..0x%08lx\n",
+ (unsigned long)reg->base,
+ (unsigned long)(reg->base + reg->size - 1));
+ else
+ seq_printf(m, "0x%016llx..0x%016llx\n",
+ (unsigned long long)reg->base,
+ (unsigned long long)(reg->base + reg->size - 1));
+
+ }
+ return 0;
+}
+
+static int memblock_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, memblock_debug_show, inode->i_private);
+}
+
+static const struct file_operations memblock_debug_fops = {
+ .open = memblock_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init memblock_init_debugfs(void)
+{
+ struct dentry *root = debugfs_create_dir("memblock", NULL);
+ if (!root)
+ return -ENXIO;
+ debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
+ debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
+
+ return 0;
+}
+__initcall(memblock_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80bed..8ab841031436 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
+ * Memory thresholds
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +25,7 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -32,17 +37,23 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/spinlock.h>
+#include <linux/eventfd.h>
+#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
#include "internal.h"
#include <asm/uaccess.h>
+#include <trace/events/vmscan.h>
+
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -50,12 +61,27 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
-static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+
+/* for remembering the boot option */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata = 0;
+#endif
+
#else
#define do_swap_account (0)
#endif
-#define SOFTLIMIT_EVENTS_THRESH (1000)
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. This counter
+ * is used to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ *
+ * These values will be used as !((event) & ((1 <<(thresh)) - 1))
+ */
+#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
+#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
/*
* Statistics for memory cgroup.
@@ -69,62 +95,19 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
- MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+ /* incremented at every pagein/pageout */
+ MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
+ MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
MEM_CGROUP_STAT_NSTATS,
};
struct mem_cgroup_stat_cpu {
s64 count[MEM_CGROUP_STAT_NSTATS];
-} ____cacheline_aligned_in_smp;
-
-struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[0];
};
-static inline void
-__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx)
-{
- stat->count[idx] = 0;
-}
-
-static inline s64
-__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx)
-{
- return stat->count[idx];
-}
-
-/*
- * For accounting under irq disable, no need for increment preempt count.
- */
-static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx, int val)
-{
- stat->count[idx] += val;
-}
-
-static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
- enum mem_cgroup_stat_index idx)
-{
- int cpu;
- s64 ret = 0;
- for_each_possible_cpu(cpu)
- ret += stat->cpustat[cpu].count[idx];
- return ret;
-}
-
-static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
-{
- s64 ret;
-
- ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
- ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
- return ret;
-}
-
/*
* per-zone information in memory controller.
*/
@@ -174,6 +157,41 @@ struct mem_cgroup_tree {
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+struct mem_cgroup_threshold {
+ struct eventfd_ctx *eventfd;
+ u64 threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+ /* An array index points to threshold just below usage. */
+ int current_threshold;
+ /* Size of entries[] */
+ unsigned int size;
+ /* Array of thresholds */
+ struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+ /* Primary thresholds array */
+ struct mem_cgroup_threshold_ary *primary;
+ /*
+ * Spare threshold array.
+ * This is needed to make mem_cgroup_unregister_event() "never fail".
+ * It must be able to store at least primary->size - 1 entries.
+ */
+ struct mem_cgroup_threshold_ary *spare;
+};
+
+/* for OOM */
+struct mem_cgroup_eventfd_list {
+ struct list_head list;
+ struct eventfd_ctx *eventfd;
+};
+
+static void mem_cgroup_threshold(struct mem_cgroup *mem);
+static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
@@ -206,8 +224,6 @@ struct mem_cgroup {
*/
spinlock_t reclaim_param_lock;
- int prev_priority; /* for recording reclaim priority */
-
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
@@ -217,20 +233,83 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- unsigned long last_oom_jiffies;
+ atomic_t oom_lock;
atomic_t refcnt;
unsigned int swappiness;
+ /* OOM-Killer disable */
+ int oom_kill_disable;
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
+ /* protect arrays of thresholds */
+ struct mutex thresholds_lock;
+
+ /* thresholds for memory usage. RCU-protected */
+ struct mem_cgroup_thresholds thresholds;
+
+ /* thresholds for mem+swap usage. RCU-protected */
+ struct mem_cgroup_thresholds memsw_thresholds;
+
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+
+ /*
+ * Should we move charges of a task when a task is moved into this
+ * mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+ /*
+ * percpu counter.
+ */
+ struct mem_cgroup_stat_cpu *stat;
/*
- * statistics. This must be placed at the end of memcg.
+ * used when a cpu is offlined or other synchronizations
+ * See mem_cgroup_read_stat().
*/
- struct mem_cgroup_stat stat;
+ struct mem_cgroup_stat_cpu nocpu_base;
+ spinlock_t pcp_counter_lock;
};
+/* Stuffs for move charges at task migration. */
+/*
+ * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
+ * left-shifted bitmap of these types.
+ */
+enum move_type {
+ MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
+ MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
+ NR_MOVE_TYPE,
+};
+
+/* "mc" and its members are protected by cgroup_mutex */
+static struct move_charge_struct {
+ spinlock_t lock; /* for from, to */
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ unsigned long precharge;
+ unsigned long moved_charge;
+ unsigned long moved_swap;
+ struct task_struct *moving_task; /* a task moving charges */
+ wait_queue_head_t waitq; /* a waitq for other context */
+} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
+ .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
+};
+
+static bool move_anon(void)
+{
+ return test_bit(MOVE_CHARGE_TYPE_ANON,
+ &mc.to->move_charge_at_immigrate);
+}
+
+static bool move_file(void)
+{
+ return test_bit(MOVE_CHARGE_TYPE_FILE,
+ &mc.to->move_charge_at_immigrate);
+}
+
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
@@ -258,9 +337,12 @@ enum charge_type {
/* for encoding cft->private value on file */
#define _MEM (0)
#define _MEMSWAP (1)
+#define _OOM_TYPE (2)
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
+/* Used for OOM notifier */
+#define OOM_CONTROL (0)
/*
* Reclaim flags for mem_cgroup_hierarchical_reclaim
@@ -371,23 +453,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
spin_unlock(&mctz->lock);
}
-static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
-{
- bool ret = false;
- int cpu;
- s64 val;
- struct mem_cgroup_stat_cpu *cpustat;
-
- cpu = get_cpu();
- cpustat = &mem->stat.cpustat[cpu];
- val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
- if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
- __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
- ret = true;
- }
- put_cpu();
- return ret;
-}
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
@@ -481,17 +546,57 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
return mz;
}
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both vmstat[] and percpu_counter have a threshold and do periodic
+ * synchronization to implement a "quick" read. There is a trade-off between
+ * reading cost and precision of the value. So we may have a chance to
+ * implement periodic synchronization of memcg's counters as well.
+ *
+ * But this _read() function is used for the user interface now. The user
+ * accounts memory usage by memory cgroup and _always_ requires an exact value
+ * because he accounts memory. Even if we provided a quick-and-fuzzy read, we
+ * would always have to visit all online cpus and sum them up. So, for now,
+ * unnecessary synchronization is not implemented (only cpu hotplug is
+ * handled).
+ *
+ * If there are kernel internal actions which can make use of a not-exact
+ * value, and reading all cpu values becomes a performance bottleneck in some
+ * common workload, a threshold and synchronization like vmstat[] should be
+ * implemented.
+ *
+ * Returns the sum of counter "idx" over all online cpus, plus (with
+ * CONFIG_HOTPLUG_CPU) the nocpu_base value for "idx".
+ */
+static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
+{
+ int cpu;
+ s64 val = 0;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ /* nocpu_base is read under pcp_counter_lock */
+ spin_lock(&mem->pcp_counter_lock);
+ val += mem->nocpu_base.count[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+#endif
+ put_online_cpus();
+ return val;
+}
+
+/* Local usage of "mem": the sum of its RSS and CACHE statistics. */
+static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+{
+ s64 rss = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+ s64 cache = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+
+ return rss + cache;
+}
+
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
bool charge)
{
int val = (charge) ? 1 : -1;
- struct mem_cgroup_stat *stat = &mem->stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu = get_cpu();
-
- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
- put_cpu();
+ this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -499,24 +604,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
bool charge)
{
int val = (charge) ? 1 : -1;
- struct mem_cgroup_stat *stat = &mem->stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu = get_cpu();
- cpustat = &stat->cpustat[cpu];
+ preempt_disable();
+
if (PageCgroupCache(pc))
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
else
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
if (charge)
- __mem_cgroup_stat_add_safe(cpustat,
- MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+ __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
else
- __mem_cgroup_stat_add_safe(cpustat,
- MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
- put_cpu();
+ __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+ __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+
+ preempt_enable();
}
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -534,6 +636,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
return total;
}
+/*
+ * True when the low "event_mask_shift" bits of this cpu's event counter are
+ * all zero, i.e. once every 2^event_mask_shift events.
+ */
+static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
+{
+ s64 mask = (1 << event_mask_shift) - 1;
+ s64 events = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
+
+ return (events & mask) == 0;
+}
+
+/*
+ * Check events in order: the threshold event first, and the soft limit tree
+ * update only when the threshold event fired as well.
+ */
+static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
+{
+ /* threshold event is triggered in finer grain than soft limit */
+ if (likely(!__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH)))
+ return;
+
+ mem_cgroup_threshold(mem);
+ if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
+ mem_cgroup_update_tree(mem, page);
+}
+
static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
return container_of(cgroup_subsys_state(cont,
@@ -576,40 +701,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return mem;
}
-/*
- * Call callback function against all cgroup under hierarchy tree.
- */
-static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
- int (*func)(struct mem_cgroup *, void *))
+/* The caller has to guarantee "mem" exists before calling this */
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
{
- int found, ret, nextid;
struct cgroup_subsys_state *css;
- struct mem_cgroup *mem;
-
- if (!root->use_hierarchy)
- return (*func)(root, data);
+ int found;
- nextid = 1;
- do {
- ret = 0;
+ if (!mem) /* ROOT cgroup has the smallest ID */
+ return root_mem_cgroup; /*css_put/get against root is ignored*/
+ if (!mem->use_hierarchy) {
+ if (css_tryget(&mem->css))
+ return mem;
+ return NULL;
+ }
+ rcu_read_lock();
+ /*
+ * searching a memory cgroup which has the smallest ID under given
+ * ROOT cgroup. (ID >= 1)
+ */
+ css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+ if (css && css_tryget(css))
+ mem = container_of(css, struct mem_cgroup, css);
+ else
mem = NULL;
+ rcu_read_unlock();
+ return mem;
+}
+
+static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
+ struct mem_cgroup *root,
+ bool cond)
+{
+ int nextid = css_id(&iter->css) + 1;
+ int found;
+ int hierarchy_used;
+ struct cgroup_subsys_state *css;
+
+ hierarchy_used = iter->use_hierarchy;
+
+ css_put(&iter->css);
+ /* If no ROOT, walk all, ignore hierarchy */
+ if (!cond || (root && !hierarchy_used))
+ return NULL;
+
+ if (!root)
+ root = root_mem_cgroup;
+ do {
+ iter = NULL;
rcu_read_lock();
- css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
- &found);
+
+ css = css_get_next(&mem_cgroup_subsys, nextid,
+ &root->css, &found);
if (css && css_tryget(css))
- mem = container_of(css, struct mem_cgroup, css);
+ iter = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
-
- if (mem) {
- ret = (*func)(mem, data);
- css_put(&mem->css);
- }
+ /* If css is NULL, no more cgroups will be found */
nextid = found + 1;
- } while (!ret && css);
+ } while (css && !iter);
- return ret;
+ return iter;
}
+/*
+ * for_each_mem_cgroup_tree() visits every cgroup under a hierarchy tree.
+ * Be careful: leaving the loop with "break" is not allowed because the
+ * iterator holds a css reference count. Instead, make "cond" false and
+ * "continue", so that mem_cgroup_get_next() drops the reference and ends
+ * the loop.
+ */
+#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
+ for (iter = mem_cgroup_start_loop(root);\
+ iter != NULL;\
+ iter = mem_cgroup_get_next(iter, root, cond))
+
+/* Unconditional walk of the tree under "root". */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for_each_mem_cgroup_tree_cond(iter, root, true)
+
+/* Walk with a NULL root: all cgroups are visited, ignoring hierarchy. */
+#define for_each_mem_cgroup_all(iter) \
+ for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
+
static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
@@ -652,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
- return;
}
void mem_cgroup_del_lru(struct page *page)
@@ -756,12 +923,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
struct mem_cgroup *curr = NULL;
+ struct task_struct *p;
- task_lock(task);
- rcu_read_lock();
- curr = try_get_mem_cgroup_from_mm(task->mm);
- rcu_read_unlock();
- task_unlock(task);
+ p = find_lock_task_mm(task);
+ if (!p)
+ return 0;
+ curr = try_get_mem_cgroup_from_mm(p->mm);
+ task_unlock(p);
if (!curr)
return 0;
/*
@@ -778,35 +946,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
return ret;
}
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
- int prev_priority;
-
- spin_lock(&mem->reclaim_param_lock);
- prev_priority = mem->prev_priority;
- spin_unlock(&mem->reclaim_param_lock);
-
- return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- if (priority < mem->prev_priority)
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
@@ -864,7 +1003,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -874,7 +1013,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -919,7 +1058,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
LIST_HEAD(pc_list);
struct list_head *src;
struct page_cgroup *pc, *tmp;
- int nid = z->zone_pgdat->node_id;
+ int nid = zone_to_nid(z);
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * file + active;
@@ -946,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
@@ -958,6 +1097,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
}
*scanned = scan;
+
+ trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
+ 0, 0, 0, mode);
+
return nr_taken;
}
@@ -992,15 +1135,94 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
-static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
- int *val = data;
- (*val)++;
- return 0;
+ int cpu;
+
+ get_online_cpus();
+ spin_lock(&mem->pcp_counter_lock);
+ for_each_online_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+ mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+ spin_unlock(&mem->pcp_counter_lock);
+ put_online_cpus();
+
+ synchronize_rcu();
+}
+
+/*
+ * Decrement the MEM_CGROUP_ON_MOVE counter of "mem" on every online cpu and
+ * in nocpu_base, all under pcp_counter_lock. A NULL "mem" is a no-op.
+ */
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+ int cpu;
+
+ if (!mem)
+ return;
+ get_online_cpus();
+ spin_lock(&mem->pcp_counter_lock);
+ for_each_online_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+ mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+ spin_unlock(&mem->pcp_counter_lock);
+ put_online_cpus();
+}
+/*
+ * Two routines for checking whether "mem" is under move_account() or not.
+ *
+ * mem_cgroup_stealed() - checks whether a cgroup is mc.from or not. This is
+ * used for avoiding a race in accounting. If true,
+ * pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
+ * under the hierarchy of the moving cgroups. This is
+ * for waiting at high memory pressure caused by "move".
+ */
+
+/*
+ * True when this cpu's MEM_CGROUP_ON_MOVE counter for "mem" is positive.
+ * Must be called with rcu_read_lock() held.
+ */
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+ s64 on_move;
+
+ VM_BUG_ON(!rcu_read_lock_held());
+ on_move = this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]);
+ return on_move > 0;
+}
+
+/*
+ * True when a move is in progress (mc.from is set) and "mem" is mc.from or
+ * mc.to, or, with use_hierarchy, is related to one of them as reported by
+ * css_is_ancestor().
+ */
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *src;
+ struct mem_cgroup *dst;
+ bool moving = false;
+
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ src = mc.from;
+ dst = mc.to;
+ if (src) {
+ if (src == mem || dst == mem)
+ moving = true;
+ else if (mem->use_hierarchy &&
+ (css_is_ancestor(&src->css, &mem->css) ||
+ css_is_ancestor(&dst->css, &mem->css)))
+ moving = true;
+ }
+ spin_unlock(&mc.lock);
+ return moving;
+}
+
+/*
+ * If another task is moving charges and "mem" is involved in that move,
+ * sleep on mc.waitq (TASK_INTERRUPTIBLE) and return true. Returns false when
+ * no wait was attempted: no move in progress, the mover is the current task,
+ * or "mem" is not under move.
+ */
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(mem)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ /* true even if we did not actually sleep */
+ return true;
+ }
+ }
+ return false;
}
/**
- * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
+ * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
@@ -1073,11 +1295,33 @@ done:
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
int num = 0;
- mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ num++;
return num;
}
/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+ u64 limit;
+ u64 memsw;
+
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ /*
+ * Widen before shifting: total_swap_pages is declared elsewhere
+ * (presumably a long), and a page count shifted by PAGE_SHIFT can
+ * overflow a 32-bit type before it is added to the u64 limit.
+ */
+ limit += (u64)total_swap_pages << PAGE_SHIFT;
+
+ memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ /*
+ * If memsw is finite and limits the amount of swap space available
+ * to this memcg, return that limit.
+ */
+ return min(limit, memsw);
+}
+
+/*
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
@@ -1174,7 +1418,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
}
}
}
- if (!mem_cgroup_local_usage(&victim->stat)) {
+ if (!mem_cgroup_local_usage(victim)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
continue;
@@ -1182,8 +1426,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
/* we use swappiness of local cgroup */
if (check_soft)
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, get_swappiness(victim), zone,
- zone->zone_pgdat->node_id);
+ noswap, get_swappiness(victim), zone);
else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
@@ -1205,69 +1448,203 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
return total;
}
-bool mem_cgroup_oom_called(struct task_struct *task)
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
- bool ret = false;
- struct mem_cgroup *mem;
- struct mm_struct *mm;
+ int x, lock_count = 0;
+ struct mem_cgroup *iter;
- rcu_read_lock();
- mm = task->mm;
- if (!mm)
- mm = &init_mm;
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
- ret = true;
- rcu_read_unlock();
- return ret;
+ for_each_mem_cgroup_tree(iter, mem) {
+ x = atomic_inc_return(&iter->oom_lock);
+ lock_count = max(x, lock_count);
+ }
+
+ if (lock_count == 1)
+ return true;
+ return false;
}
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
- mem->last_oom_jiffies = jiffies;
+ struct mem_cgroup *iter;
+
+ /*
+ * When a new child is created while the hierarchy is under oom,
+ * mem_cgroup_oom_lock() may not be called. We have to use
+ * atomic_add_unless() here.
+ */
+ for_each_mem_cgroup_tree(iter, mem)
+ atomic_add_unless(&iter->oom_lock, -1, 0);
return 0;
}
-static void record_last_oom(struct mem_cgroup *mem)
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Per-waiter record queued on memcg_oom_waitq. memcg_oom_wake_function()
+ * recovers it from the embedded "wait" entry via container_of().
+ */
+struct oom_wait_info {
+ struct mem_cgroup *mem; /* the cgroup this waiter is waiting on */
+ wait_queue_t wait; /* entry queued on memcg_oom_waitq */
+};
+
+/*
+ * Wake function for memcg_oom_waitq entries. "arg" is the mem_cgroup being
+ * woken; the waiter is woken only when it waits on that cgroup or, with
+ * hierarchy enabled on both, on a cgroup in the same ancestor chain.
+ * Returns 0 when the wakeup does not apply to this waiter.
+ */
+static int memcg_oom_wake_function(wait_queue_t *wait,
+ unsigned mode, int sync, void *arg)
+{
+ struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
+ struct oom_wait_info *info;
+ struct mem_cgroup *waiter;
+
+ info = container_of(wait, struct oom_wait_info, wait);
+ waiter = info->mem;
+
+ if (waiter != wake_mem) {
+ /* if no hierarchy, no match */
+ if (!waiter->use_hierarchy || !wake_mem->use_hierarchy)
+ return 0;
+ /*
+ * Both of waiter and wake_mem are stable under us.
+ * Then we can use css_is_ancestor without taking care of RCU.
+ */
+ if (!css_is_ancestor(&waiter->css, &wake_mem->css) &&
+ !css_is_ancestor(&wake_mem->css, &waiter->css))
+ return 0;
+ }
+
+ return autoremove_wake_function(wait, mode, sync, arg);
+}
+
+/*
+ * Wake up waiters on memcg_oom_waitq. "mem" is handed to each waiter's wake
+ * function as the key, so the wake function can decide whether the wakeup
+ * applies to that waiter.
+ */
+static void memcg_wakeup_oom(struct mem_cgroup *mem)
+{
+ /* for filtering, pass "mem" as argument. */
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
+}
+
+/* Wake OOM waiters of "mem" when it is non-NULL and its oom_lock is non-zero. */
+static void memcg_oom_recover(struct mem_cgroup *mem)
+{
+ if (!mem)
+ return;
+ if (!atomic_read(&mem->oom_lock))
+ return;
+ memcg_wakeup_oom(mem);
+}
+
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
- mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+ struct oom_wait_info owait;
+ bool locked, need_to_kill;
+
+ owait.mem = mem;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.task_list);
+ need_to_kill = true;
+ /* At first, try to OOM lock hierarchy under mem.*/
+ mutex_lock(&memcg_oom_mutex);
+ locked = mem_cgroup_oom_lock(mem);
+ /*
+ * Even if signal_pending(), we can't quit charge() loop without
+ * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+ * under OOM is always welcomed, use TASK_KILLABLE here.
+ */
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ if (!locked || mem->oom_kill_disable)
+ need_to_kill = false;
+ if (locked)
+ mem_cgroup_oom_notify(mem);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (need_to_kill) {
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(mem, mask);
+ } else {
+ schedule();
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+ mutex_lock(&memcg_oom_mutex);
+ mem_cgroup_oom_unlock(mem);
+ memcg_wakeup_oom(mem);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ return false;
+ /* Give chance to dying process */
+ schedule_timeout(1);
+ return true;
}
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup member but
+ * it tends to be costly. But considering some conditions, we doesn't need
+ * to do so _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to radix-tree. There
+ * are no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
+ * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
+ * if there are race with "uncharge". Statistics itself is properly handled
+ * by flags.
+ *
+ * Considering "move", this is an only case we see a race. To make the race
+ * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
+ * possibility of race condition. If there is, we take a lock.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *mem;
- struct mem_cgroup_stat *stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu;
- struct page_cgroup *pc;
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ bool need_unlock = false;
+ unsigned long uninitialized_var(flags);
- pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
- lock_page_cgroup(pc);
+ rcu_read_lock();
mem = pc->mem_cgroup;
- if (!mem)
- goto done;
+ if (unlikely(!mem || !PageCgroupUsed(pc)))
+ goto out;
+ /* pc->mem_cgroup is unstable ? */
+ if (unlikely(mem_cgroup_stealed(mem))) {
+ /* take a lock against to access pc->mem_cgroup */
+ move_lock_page_cgroup(pc, &flags);
+ need_unlock = true;
+ mem = pc->mem_cgroup;
+ if (!mem || !PageCgroupUsed(pc))
+ goto out;
+ }
- if (!PageCgroupUsed(pc))
- goto done;
+ switch (idx) {
+ case MEMCG_NR_FILE_MAPPED:
+ if (val > 0)
+ SetPageCgroupFileMapped(pc);
+ else if (!page_mapped(page))
+ ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
+ break;
+ default:
+ BUG();
+ }
- /*
- * Preemption is already disabled, we don't need get_cpu()
- */
- cpu = smp_processor_id();
- stat = &mem->stat;
- cpustat = &stat->cpustat[cpu];
+ this_cpu_add(mem->stat->count[idx], val);
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
-done:
- unlock_page_cgroup(pc);
+out:
+ if (unlikely(need_unlock))
+ move_unlock_page_cgroup(pc, &flags);
+ rcu_read_unlock();
+ return;
}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1330,7 +1707,7 @@ static void drain_local_stock(struct work_struct *dummy)
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
- * This will be consumed by consumt_stock() function, later.
+ * This will be consumed by consume_stock() function, later.
*/
static void refill_stock(struct mem_cgroup *mem, int val)
{
@@ -1382,38 +1759,149 @@ static void drain_all_stock_sync(void)
atomic_dec(&memcg_drain_count);
}
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * Fold the percpu counters of a DEAD cpu into mem->nocpu_base and zero the
+ * percpu slots. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+ int idx = 0;
+
+ spin_lock(&mem->pcp_counter_lock);
+ while (idx < MEM_CGROUP_STAT_DATA) {
+ mem->nocpu_base.count[idx] += per_cpu(mem->stat->count[idx], cpu);
+ per_cpu(mem->stat->count[idx], cpu) = 0;
+ idx++;
+ }
+ /* need to clear ON_MOVE value, works as a kind of lock. */
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+ spin_unlock(&mem->pcp_counter_lock);
+}
+
+/* Copy the ON_MOVE value kept in nocpu_base into the percpu slot of "cpu". */
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+{
+ spin_lock(&mem->pcp_counter_lock);
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) =
+ mem->nocpu_base.count[MEM_CGROUP_ON_MOVE];
+ spin_unlock(&mem->pcp_counter_lock);
+}
+
+/*
+ * CPU hotplug notifier for memcg.
+ *
+ * CPU_ONLINE: copy the ON_MOVE state of every memcg to the new cpu.
+ * CPU_DEAD / CPU_DEAD_FROZEN: fold the dead cpu's percpu counters into
+ * nocpu_base for every memcg and drain its charge stock.
+ * Any other action is ignored. Always returns NOTIFY_OK.
+ */
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
+ struct mem_cgroup *iter;
- if (action != CPU_DEAD)
+ if (action == CPU_ONLINE) {
+ for_each_mem_cgroup_all(iter)
+ synchronize_mem_cgroup_on_move(iter, cpu);
return NOTIFY_OK;
+ }
+
+ /*
+ * Only the two "dead" actions drain. This must be "&&": with "||" the
+ * test is true for every action and the drain below never runs.
+ */
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+ return NOTIFY_OK;
+
+ for_each_mem_cgroup_all(iter)
+ mem_cgroup_drain_pcp_counter(iter, cpu);
+
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
}
+
+/*
+ * Result codes of __mem_cgroup_do_charge().
+ * See __mem_cgroup_try_charge() for details.
+ */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry, but retrying is not bad */
+ CHARGE_NOMEM, /* we can't do more; return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough resource */
+ CHARGE_OOM_DIE, /* the current task is killed because of OOM */
+};
+
+/*
+ * Make one attempt to charge "csize" bytes to "mem".
+ *
+ * Charges mem->res and, when swap accounting is on, mem->memsw. If either
+ * counter is over its limit, tries hierarchical reclaim on the cgroup that
+ * hit the limit and, if "oom_check" is set, falls back to the OOM handler.
+ *
+ * Returns one of the CHARGE_* codes. On any result other than CHARGE_OK no
+ * charge is left behind on either counter.
+ */
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ int csize, bool oom_check)
+{
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ /*
+ * mem+swap counter fails: give back the memory charge taken
+ * above, otherwise "csize" leaks from mem->res on every
+ * failed attempt.
+ */
+ res_counter_uncharge(&mem->res, csize);
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (csize > PAGE_SIZE) /* change csize and retry */
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /* If we don't need to call oom-killer at all, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ return CHARGE_OOM_DIE;
+
+ return CHARGE_RETRY;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg,
- bool oom, struct page *page)
+ gfp_t gfp_mask,
+ struct mem_cgroup **memcg, bool oom,
+ int page_size)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res;
- int csize = CHARGE_SIZE;
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *mem = NULL;
+ int ret;
+ int csize = max(CHARGE_SIZE, (unsigned long) page_size);
- if (unlikely(test_thread_flag(TIF_MEMDIE))) {
- /* Don't account this! */
- *memcg = NULL;
- return 0;
- }
+ /*
+ * Unlike gloval-vm's OOM-kill, we're not in memory shortage
+ * in system level. So, allow to go ahead dying process in addition to
+ * MEMDIE process.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)
+ || fatal_signal_pending(current)))
+ goto bypass;
/*
* We always charge the cgroup the mm_struct belongs to.
@@ -1421,90 +1909,111 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- mem = *memcg;
- if (likely(!mem)) {
- mem = try_get_mem_cgroup_from_mm(mm);
- *memcg = mem;
- } else {
+ if (!*memcg && !mm)
+ goto bypass;
+again:
+ if (*memcg) { /* css should be a valid one */
+ mem = *memcg;
+ VM_BUG_ON(css_is_removed(&mem->css));
+ if (mem_cgroup_is_root(mem))
+ goto done;
+ if (page_size == PAGE_SIZE && consume_stock(mem))
+ goto done;
css_get(&mem->css);
- }
- if (unlikely(!mem))
- return 0;
-
- VM_BUG_ON(css_is_removed(&mem->css));
- if (mem_cgroup_is_root(mem))
- goto done;
-
- while (1) {
- int ret = 0;
- unsigned long flags = 0;
+ } else {
+ struct task_struct *p;
- if (consume_stock(mem))
- goto charged;
+ rcu_read_lock();
+ p = rcu_dereference(mm->owner);
+ /*
+ * Because we don't have task_lock(), "p" can exit.
+ * In that case, "mem" can point to root or p can be NULL with
+ * race with swapoff. Then, we have small risk of mis-accouning.
+ * But such kind of mis-account by race always happens because
+ * we don't have cgroup_mutex(). It's overkill and we allo that
+ * small race, here.
+ * (*) swapoff at el will charge against mm-struct not against
+ * task-struct. So, mm->owner can be NULL.
+ */
+ mem = mem_cgroup_from_task(p);
+ if (!mem || mem_cgroup_is_root(mem)) {
+ rcu_read_unlock();
+ goto done;
+ }
+ if (page_size == PAGE_SIZE && consume_stock(mem)) {
+ /*
+ * It seems dagerous to access memcg without css_get().
+ * But considering how consume_stok works, it's not
+ * necessary. If consume_stock success, some charges
+ * from this memcg are cached on this cpu. So, we
+ * don't need to call css_get()/css_tryget() before
+ * calling consume_stock().
+ */
+ rcu_read_unlock();
+ goto done;
+ }
+ /* after here, we may be blocked. we need to get refcnt */
+ if (!css_tryget(&mem->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+ }
- ret = res_counter_charge(&mem->res, csize, &fail_res);
- if (likely(!ret)) {
- if (!do_swap_account)
- break;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
- if (likely(!ret))
- break;
- /* mem+swap counter fails */
- res_counter_uncharge(&mem->res, csize);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- memsw);
- } else
- /* mem counter fails */
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- res);
+ do {
+ bool oom_check;
- /* reduce request size and retry */
- if (csize > PAGE_SIZE) {
- csize = PAGE_SIZE;
- continue;
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current)) {
+ css_put(&mem->css);
+ goto bypass;
}
- if (!(gfp_mask & __GFP_WAIT))
- goto nomem;
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (ret)
- continue;
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ }
- /*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- *
- */
- if (mem_cgroup_check_under_limit(mem_over_limit))
- continue;
+ ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
- if (!nr_retries--) {
- if (oom) {
- mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
- record_last_oom(mem_over_limit);
- }
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ csize = page_size;
+ css_put(&mem->css);
+ mem = NULL;
+ goto again;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ css_put(&mem->css);
goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
+ if (!oom) {
+ css_put(&mem->css);
+ goto nomem;
+ }
+ /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+ css_put(&mem->css);
+ goto bypass;
}
- }
- if (csize > PAGE_SIZE)
- refill_stock(mem, csize - PAGE_SIZE);
-charged:
- /*
- * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
- * if they exceeds softlimit.
- */
- if (mem_cgroup_soft_limit_check(mem))
- mem_cgroup_update_tree(mem, page);
+ } while (ret != CHARGE_OK);
+
+ if (csize > page_size)
+ refill_stock(mem, csize - page_size);
+ css_put(&mem->css);
done:
+ *memcg = mem;
return 0;
nomem:
- css_put(&mem->css);
+ *memcg = NULL;
return -ENOMEM;
+bypass:
+ *memcg = NULL;
+ return 0;
}
/*
@@ -1512,14 +2021,20 @@ nomem:
* This function is for that and do uncharge, put css's refcnt.
* gotten by try_charge().
*/
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+ unsigned long count)
{
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, PAGE_SIZE * count);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
}
- css_put(&mem->css);
+}
+
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+ int page_size)
+{
+ __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
}
/*
@@ -1573,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
* commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
* USED state. If already USED, uncharge and return.
*/
-
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- enum charge_type ctype)
+static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ enum charge_type ctype)
{
- /* try_charge() can return NULL to *memcg, taking care of it. */
- if (!mem)
- return;
-
- lock_page_cgroup(pc);
- if (unlikely(PageCgroupUsed(pc))) {
- unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem);
- return;
- }
-
pc->mem_cgroup = mem;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -1613,8 +2116,41 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
}
mem_cgroup_charge_statistics(mem, pc, true);
+}
+
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ enum charge_type ctype,
+ int page_size)
+{
+ int i;
+ int count = page_size >> PAGE_SHIFT;
+
+ /* try_charge() can return NULL to *memcg, taking care of it. */
+ if (!mem)
+ return;
+
+ lock_page_cgroup(pc);
+ if (unlikely(PageCgroupUsed(pc))) {
+ unlock_page_cgroup(pc);
+ mem_cgroup_cancel_charge(mem, page_size);
+ return;
+ }
+
+ /*
+ * we don't need page_cgroup_lock about tail pages, because they are not
+ * accessed by any other context at this point.
+ */
+ for (i = 0; i < count; i++)
+ ____mem_cgroup_commit_charge(mem, pc + i, ctype);
unlock_page_cgroup(pc);
+ /*
+ * "charge_statistics" updated event counter. Then, check it.
+ * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+ * if they exceeds softlimit.
+ */
+ memcg_check_events(mem, pc->page);
}
/**
@@ -1622,61 +2158,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
+ * @uncharge: whether we should call uncharge and css_put against @from.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
* - the pc is locked, used, and ->mem_cgroup points to @from.
*
- * This function does "uncharge" from old cgroup but doesn't do "charge" to
- * new cgroup. It should be done by a caller.
+ * This function doesn't do "charge" nor css_get to new cgroup. It should be
+ * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is
+ * true, this function does "uncharge" from old cgroup, but it doesn't if
+ * @uncharge is false, so a caller should do "uncharge".
*/
static void __mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to)
+ struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
- struct page *page;
- int cpu;
- struct mem_cgroup_stat *stat;
- struct mem_cgroup_stat_cpu *cpustat;
-
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!page_is_cgroup_locked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);
- if (!mem_cgroup_is_root(from))
- res_counter_uncharge(&from->res, PAGE_SIZE);
- mem_cgroup_charge_statistics(from, pc, false);
-
- page = pc->page;
- if (page_mapped(page) && !PageAnon(page)) {
- cpu = smp_processor_id();
- /* Update mapped_file data for mem_cgroup "from" */
- stat = &from->stat;
- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
- -1);
-
- /* Update mapped_file data for mem_cgroup "to" */
- stat = &to->stat;
- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
- 1);
+ if (PageCgroupFileMapped(pc)) {
+ /* Update mapped_file data for mem_cgroup */
+ preempt_disable();
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ preempt_enable();
}
+ mem_cgroup_charge_statistics(from, pc, false);
+ if (uncharge)
+ /* This is not "cancel", but cancel_charge does all we need. */
+ mem_cgroup_cancel_charge(from, PAGE_SIZE);
- if (do_swap_account && !mem_cgroup_is_root(from))
- res_counter_uncharge(&from->memsw, PAGE_SIZE);
- css_put(&from->css);
-
- css_get(&to->css);
+ /* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
- * this function is just force_empty() and it's garanteed that
- * "to" is never removed. So, we don't check rmdir status here.
+ * this function is just force_empty() and move charge, so it's
+ * guaranteed that "to" is never removed. So, we don't check rmdir
+ * status here.
*/
}
@@ -1685,15 +2208,24 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
* __mem_cgroup_move_account()
*/
static int mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to)
+ struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
int ret = -EINVAL;
+ unsigned long flags;
+
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
- __mem_cgroup_move_account(pc, from, to);
+ move_lock_page_cgroup(pc, &flags);
+ __mem_cgroup_move_account(pc, from, to, uncharge);
+ move_unlock_page_cgroup(pc, &flags);
ret = 0;
}
unlock_page_cgroup(pc);
+ /*
+ * check events
+ */
+ memcg_check_events(to, pc->page);
+ memcg_check_events(from, pc->page);
return ret;
}
@@ -1722,15 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
goto put;
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
+ PAGE_SIZE);
if (ret || !parent)
goto put_back;
- ret = mem_cgroup_move_account(pc, child, parent);
- if (!ret)
- css_put(&parent->css); /* drop extra refcnt by try_charge() */
- else
- mem_cgroup_cancel_charge(parent); /* does css_put */
+ ret = mem_cgroup_move_account(pc, child, parent, true);
+ if (ret)
+ mem_cgroup_cancel_charge(parent, PAGE_SIZE);
put_back:
putback_lru_page(page);
put:
@@ -1746,12 +2277,17 @@ out:
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem = NULL;
struct page_cgroup *pc;
int ret;
+ int page_size = PAGE_SIZE;
+
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
pc = lookup_page_cgroup(page);
/* can happen at boot */
@@ -1759,12 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
return 0;
prefetchw(pc);
- mem = memcg;
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
return 0;
}
@@ -1773,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- if (PageCompound(page))
- return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
@@ -1787,7 +2320,7 @@ int mem_cgroup_newpage_charge(struct page *page,
if (unlikely(!mm))
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
static void
@@ -1797,7 +2330,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
int ret;
if (mem_cgroup_disabled())
@@ -1818,7 +2350,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
-
pc = lookup_page_cgroup(page);
if (!pc)
return 0;
@@ -1830,22 +2361,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
unlock_page_cgroup(pc);
}
- if (unlikely(!mm && !mem))
+ if (unlikely(!mm))
mm = &init_mm;
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
/* shmem */
if (PageSwapCache(page)) {
+ struct mem_cgroup *mem = NULL;
+
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
MEM_CGROUP_CHARGE_TYPE_SHMEM);
} else
ret = mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+ MEM_CGROUP_CHARGE_TYPE_SHMEM);
return ret;
}
@@ -1880,14 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
- /* drop extra refcnt from tryget */
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
+ return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
}
static void
@@ -1903,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype);
+ __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
@@ -1952,26 +2484,18 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, PAGE_SIZE);
}
static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+ int page_size)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
/* If swapout, usage of swap doesn't decrease */
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
uncharge_memsw = false;
- /*
- * do_batch > 0 when unmapping pages or inode invalidate/truncate.
- * In those cases, all pages freed continously can be expected to be in
- * the same cgroup and we have chance to coalesce uncharges.
- * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
- * because we want to do uncharge as soon as possible.
- */
- if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
- goto direct_uncharge;
batch = &current->memcg_batch;
/*
@@ -1982,6 +2506,20 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
if (!batch->memcg)
batch->memcg = mem;
/*
+ * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+ * In those cases, all pages freed continuously can be expected to be in
+ * the same cgroup and we have chance to coalesce uncharges.
+ * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+ * because we want to do uncharge as soon as possible.
+ */
+
+ if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+ goto direct_uncharge;
+
+ if (page_size != PAGE_SIZE)
+ goto direct_uncharge;
+
+ /*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
* If not, we uncharge res_counter ony by one.
@@ -1994,9 +2532,11 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
batch->memsw_bytes += PAGE_SIZE;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, page_size);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, page_size);
+ if (unlikely(batch->memcg != mem))
+ memcg_oom_recover(mem);
return;
}
@@ -2006,9 +2546,11 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
+ int i;
+ int count;
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
- struct mem_cgroup_per_zone *mz;
+ int page_size = PAGE_SIZE;
if (mem_cgroup_disabled())
return NULL;
@@ -2016,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (PageSwapCache(page))
return NULL;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
+
+ count = page_size >> PAGE_SHIFT;
/*
* Check if our page_cgroup is valid
*/
@@ -2033,7 +2581,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
case MEM_CGROUP_CHARGE_TYPE_DROP:
- if (page_mapped(page))
+ /* See mem_cgroup_prepare_migration() */
+ if (page_mapped(page) || PageCgroupMigration(pc))
goto unlock_out;
break;
case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2047,11 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
- if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- mem_cgroup_swap_statistics(mem, true);
- mem_cgroup_charge_statistics(mem, pc, false);
+ for (i = 0; i < count; i++)
+ mem_cgroup_charge_statistics(mem, pc + i, false);
ClearPageCgroupUsed(pc);
/*
@@ -2061,14 +2607,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
* special functions.
*/
- mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
-
- if (mem_cgroup_soft_limit_check(mem))
- mem_cgroup_update_tree(mem, page);
- /* at swapout, this memcg will be accessed to record to swap */
- if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- css_put(&mem->css);
+ /*
+ * even after unlock, we have mem->res.usage here and this memcg
+ * will never be freed.
+ */
+ memcg_check_events(mem, page);
+ if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+ mem_cgroup_swap_statistics(mem, true);
+ mem_cgroup_get(mem);
+ }
+ if (!mem_cgroup_is_root(mem))
+ __do_uncharge(mem, ctype, page_size);
return mem;
@@ -2134,6 +2684,7 @@ void mem_cgroup_uncharge_end(void)
res_counter_uncharge(&batch->memcg->res, batch->bytes);
if (batch->memsw_bytes)
res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+ memcg_oom_recover(batch->memcg);
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
}
@@ -2154,13 +2705,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
memcg = __mem_cgroup_uncharge_common(page, ctype);
- /* record memcg information */
- if (do_swap_account && swapout && memcg) {
+ /*
+ * record memcg information, if swapout && memcg != NULL,
+ * mem_cgroup_get() was called in uncharge().
+ */
+ if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
- mem_cgroup_get(memcg);
- }
- if (swapout && memcg)
- css_put(&memcg->css);
}
#endif
@@ -2192,18 +2742,78 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
}
rcu_read_unlock();
}
+
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from: mem_cgroup which the entry is moved from
+ * @to: mem_cgroup which the entry is moved to
+ * @need_fixup: whether we should fixup res_counters and refcounts.
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ *
+ * The caller must have charged to @to, IOW, called res_counter_charge() about
+ * both res and memsw, and called css_get().
+ */
+static int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+{
+ unsigned short old_id, new_id;
+
+ old_id = css_id(&from->css);
+ new_id = css_id(&to->css);
+
+ if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+ mem_cgroup_swap_statistics(from, false);
+ mem_cgroup_swap_statistics(to, true);
+ /*
+ * This function is only called from task migration context now.
+ * It postpones res_counter and refcount handling till the end
+ * of task migration(mem_cgroup_clear_mc()) for performance
+ * improvement. But we cannot postpone mem_cgroup_get(to)
+ * because if the process that has been moved to @to does
+ * swap-in, the refcount of @to might be decreased to 0.
+ */
+ mem_cgroup_get(to);
+ if (need_fixup) {
+ if (!mem_cgroup_is_root(from))
+ res_counter_uncharge(&from->memsw, PAGE_SIZE);
+ mem_cgroup_put(from);
+ /*
+ * we charged both to->res and to->memsw, so we should
+ * uncharge to->res.
+ */
+ if (!mem_cgroup_is_root(to))
+ res_counter_uncharge(&to->res, PAGE_SIZE);
+ }
+ return 0;
+ }
+ return -EINVAL;
+}
+#else
+static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+{
+ return -EINVAL;
+}
#endif
/*
* Before starting migration, account PAGE_SIZE to mem_cgroup that the old
* page belongs to.
*/
-int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
+int mem_cgroup_prepare_migration(struct page *page,
+ struct page *newpage, struct mem_cgroup **ptr)
{
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ enum charge_type ctype;
int ret = 0;
+ VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
@@ -2212,70 +2822,121 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
css_get(&mem->css);
+ /*
+ * At migrating an anonymous page, its mapcount goes down
+ * to 0 and uncharge() will be called. But, even if it's fully
+ * unmapped, migration may fail and this page has to be
+ * charged again. We set MIGRATION flag here and delay uncharge
+ * until end_migration() is called
+ *
+ * Corner Case Thinking
+ * A)
+ * When the old page was mapped as Anon and it's unmap-and-freed
+ * while migration was ongoing.
+ * If unmap finds the old page, uncharge() of it will be delayed
+ * until end_migration(). If unmap finds a new page, it's
+ * uncharged when it make mapcount to be 1->0. If unmap code
+ * finds swap_migration_entry, the new page will not be mapped
+ * and end_migration() will find it(mapcount==0).
+ *
+ * B)
+ * When the old page was mapped but migration fails, the kernel
+ * remaps it. A charge for it is kept by MIGRATION flag even
+ * if mapcount goes down to 0. We can do remap successfully
+ * without charging it again.
+ *
+ * C)
+ * The "old" page is under lock_page() until the end of
+ * migration, so, the old page itself will not be swapped-out.
+ * If the new page is swapped out before end_migration, our
+ * hook to usual swap-out path will catch the event.
+ */
+ if (PageAnon(page))
+ SetPageCgroupMigration(pc);
}
unlock_page_cgroup(pc);
+ /*
+ * If the page is not charged at this point,
+ * we return here.
+ */
+ if (!mem)
+ return 0;
- if (mem) {
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
- page);
- css_put(&mem->css);
- }
*ptr = mem;
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
+ css_put(&mem->css);/* drop extra refcnt */
+ if (ret || *ptr == NULL) {
+ if (PageAnon(page)) {
+ lock_page_cgroup(pc);
+ ClearPageCgroupMigration(pc);
+ unlock_page_cgroup(pc);
+ /*
+ * The old page may be fully unmapped while we kept it.
+ */
+ mem_cgroup_uncharge_page(page);
+ }
+ return -ENOMEM;
+ }
+ /*
+ * We charge new page before it's used/mapped. So, even if unlock_page()
+ * is called before end_migration, we can catch all events on this new
+ * page. In the case new page is migrated but not remapped, new page's
+ * mapcount will be finally 0 and we call uncharge in end_migration().
+ */
+ pc = lookup_page_cgroup(newpage);
+ if (PageAnon(page))
+ ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
+ else if (page_is_file_cache(page))
+ ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ else
+ ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+ __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
- struct page *target, *unused;
+ struct page *used, *unused;
struct page_cgroup *pc;
- enum charge_type ctype;
if (!mem)
return;
+ /* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
- /* at migration success, oldpage->mapping is NULL. */
- if (oldpage->mapping) {
- target = oldpage;
- unused = NULL;
+ if (!migration_ok) {
+ used = oldpage;
+ unused = newpage;
} else {
- target = newpage;
+ used = newpage;
unused = oldpage;
}
-
- if (PageAnon(target))
- ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
- else if (page_is_file_cache(target))
- ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
- else
- ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-
- /* unused page is not on radix-tree now. */
- if (unused)
- __mem_cgroup_uncharge_common(unused, ctype);
-
- pc = lookup_page_cgroup(target);
/*
- * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
- * So, double-counting is effectively avoided.
+ * We disallowed uncharge of pages under migration because mapcount
+ * of the page goes down to zero, temporarily.
+ * Clear the flag and check the page should be charged.
*/
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ pc = lookup_page_cgroup(oldpage);
+ lock_page_cgroup(pc);
+ ClearPageCgroupMigration(pc);
+ unlock_page_cgroup(pc);
+
+ __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
/*
- * Both of oldpage and newpage are still under lock_page().
- * Then, we don't have to care about race in radix-tree.
- * But we have to be careful that this page is unmapped or not.
- *
- * There is a case for !page_mapped(). At the start of
- * migration, oldpage was mapped. But now, it's zapped.
- * But we know *target* page is not freed/reused under us.
- * mem_cgroup_uncharge_page() does all necessary checks.
+ * If a page is a file cache, radix-tree replacement is very atomic
+ * and we can skip this check. When it was an Anon page, its mapcount
+ * goes down to 0. But because we added MIGRATION flag, it's not
+ * uncharged yet. There are several cases but page->mapcount check
+ * and USED bit check in mem_cgroup_uncharge_page() will do enough
+ * check. (see mem_cgroup_prepare_migration() also)
*/
- if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
- mem_cgroup_uncharge_page(target);
+ if (PageAnon(used))
+ mem_cgroup_uncharge_page(used);
/*
- * At migration, we may charge account against cgroup which has no tasks
+ * At migration, we may charge account against cgroup which has no
+ * tasks.
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
@@ -2313,10 +2974,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
int retry_count;
- u64 memswlimit;
+ u64 memswlimit, memlimit;
int ret = 0;
int children = mem_cgroup_count_children(memcg);
u64 curusage, oldusage;
+ int enlarge;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
@@ -2327,6 +2989,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ enlarge = 0;
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2344,6 +3007,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
mutex_unlock(&set_limit_mutex);
break;
}
+
+ memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ if (memlimit < val)
+ enlarge = 1;
+
ret = res_counter_set_limit(&memcg->res, val);
if (!ret) {
if (memswlimit == val)
@@ -2365,6 +3033,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
else
oldusage = curusage;
}
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
return ret;
}
@@ -2373,9 +3043,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
int retry_count;
- u64 memlimit, oldusage, curusage;
+ u64 memlimit, memswlimit, oldusage, curusage;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
+ int enlarge = 0;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2397,6 +3068,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
mutex_unlock(&set_limit_mutex);
break;
}
+ memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ if (memswlimit < val)
+ enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
@@ -2419,12 +3093,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
else
oldusage = curusage;
}
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
return ret;
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
- gfp_t gfp_mask, int nid,
- int zid)
+ gfp_t gfp_mask)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2436,7 +3111,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
if (order > 0)
return 0;
- mctz = soft_limit_tree_node_zone(nid, zid);
+ mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
@@ -2545,7 +3220,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
pc = list_entry(list->prev, struct page_cgroup, lru);
if (busy == pc) {
list_move(&pc->lru, list);
- busy = 0;
+ busy = NULL;
spin_unlock_irqrestore(&zone->lru_lock, flags);
continue;
}
@@ -2597,6 +3272,7 @@ move_account:
lru_add_drain_all();
drain_all_stock_sync();
ret = 0;
+ mem_cgroup_start_move(mem);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
enum lru_list l;
@@ -2610,6 +3286,8 @@ move_account:
if (ret)
break;
}
+ mem_cgroup_end_move(mem);
+ memcg_oom_recover(mem);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
goto try_to_free;
@@ -2695,64 +3373,62 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
return retval;
}
-struct mem_cgroup_idx_data {
- s64 val;
- enum mem_cgroup_stat_index idx;
-};
-static int
-mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
- struct mem_cgroup_idx_data *d = data;
- d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
- return 0;
+ struct mem_cgroup *iter;
+ s64 val = 0;
+
+ /* each per-cpu value can be negative, so accumulate in an s64 */
+ for_each_mem_cgroup_tree(iter, mem)
+ val += mem_cgroup_read_stat(iter, idx);
+
+ if (val < 0) /* race ? */
+ val = 0;
+ return val;
}
-static void
-mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx, s64 *val)
+static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
{
- struct mem_cgroup_idx_data d;
- d.idx = idx;
- d.val = 0;
- mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
- *val = d.val;
+ u64 val;
+
+ if (!mem_cgroup_is_root(mem)) {
+ if (!swap)
+ return res_counter_read_u64(&mem->res, RES_USAGE);
+ else
+ return res_counter_read_u64(&mem->memsw, RES_USAGE);
+ }
+
+ val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
+
+ if (swap)
+ val += mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_SWAPOUT);
+
+ return val << PAGE_SHIFT;
}
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
- u64 idx_val, val;
+ u64 val;
int type, name;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
- if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
- mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_CACHE, &idx_val);
- val = idx_val;
- mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_RSS, &idx_val);
- val += idx_val;
- val <<= PAGE_SHIFT;
- } else
+ if (name == RES_USAGE)
+ val = mem_cgroup_usage(mem, false);
+ else
val = res_counter_read_u64(&mem->res, name);
break;
case _MEMSWAP:
- if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
- mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_CACHE, &idx_val);
- val = idx_val;
- mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_RSS, &idx_val);
- val += idx_val;
- mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_SWAPOUT, &idx_val);
- val += idx_val;
- val <<= PAGE_SHIFT;
- } else
+ if (name == RES_USAGE)
+ val = mem_cgroup_usage(mem, true);
+ else
val = res_counter_read_u64(&mem->memsw, name);
break;
default:
@@ -2865,6 +3541,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
return 0;
}
+/*
+ * Return the move_charge_at_immigrate value of the mem_cgroup that @cgrp
+ * maps to. @cft is not used here.
+ */
+static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
+}
+
+#ifdef CONFIG_MMU
+/*
+ * Store @val as the move_charge_at_immigrate value of the mem_cgroup that
+ * @cgrp maps to. Values >= (1 << NR_MOVE_TYPE) are rejected with -EINVAL;
+ * returns 0 once the value has been stored. @cft is not used here.
+ */
+static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+ if (val >= (1 << NR_MOVE_TYPE))
+ return -EINVAL;
+ /*
+ * We check this value several times in both can_attach() and
+ * attach(), so we need cgroup lock to prevent this value from being
+ * inconsistent.
+ */
+ cgroup_lock();
+ mem->move_charge_at_immigrate = val;
+ cgroup_unlock();
+
+ return 0;
+}
+#else
+/*
+ * Variant built when CONFIG_MMU is not set: the value is never stored and
+ * every write fails with -ENOSYS.
+ */
+static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ return -ENOSYS;
+}
+#endif
+
/* For read statistics */
enum {
@@ -2904,24 +3613,24 @@ struct {
};
-static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
+static void
+mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
- struct mcs_total_stat *s = data;
s64 val;
/* per cpu stat */
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
@@ -2936,13 +3645,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
- return 0;
}
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
- mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ mem_cgroup_get_local_stat(iter, s);
}
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3049,12 +3760,344 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
return 0;
}
+/*
+ * Signal the eventfd of every threshold that the usage of @memcg has
+ * crossed (in either direction) since the previous call, then record the
+ * new position in current_threshold.
+ *
+ * @swap selects the array: false for memcg->thresholds, true for
+ * memcg->memsw_thresholds.  Nothing is done when no array is installed.
+ *
+ * The array is read under rcu_read_lock().  The two scans below only work
+ * because the entries are kept sorted by threshold (the register path
+ * sorts the array before publishing it).
+ */
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+ struct mem_cgroup_threshold_ary *t;
+ u64 usage;
+ int i;
+
+ rcu_read_lock();
+ if (!swap)
+ t = rcu_dereference(memcg->thresholds.primary);
+ else
+ t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+ if (!t)
+ goto unlock;
+
+ usage = mem_cgroup_usage(memcg, swap);
+
+ /*
+ * current_threshold points to threshold just below usage.
+ * If it's not true, a threshold was crossed after last
+ * call of __mem_cgroup_threshold().
+ */
+ i = t->current_threshold;
+
+ /*
+ * Iterate backward over array of thresholds starting from
+ * current_threshold and check if a threshold is crossed.
+ * If none of thresholds below usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* i = current_threshold + 1 */
+ i++;
+
+ /*
+ * Iterate forward over array of thresholds starting from
+ * current_threshold+1 and check if a threshold is crossed.
+ * If none of thresholds above usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* Update current_threshold */
+ t->current_threshold = i - 1;
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Run the threshold check for @memcg and for each ancestor returned by
+ * parent_mem_cgroup(), stopping when there is no further parent.  The
+ * memory+swap thresholds are only checked when do_swap_account is set.
+ */
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ __mem_cgroup_threshold(memcg, false);
+ if (do_swap_account)
+ __mem_cgroup_threshold(memcg, true);
+ }
+}
+
+/*
+ * sort() comparator that orders thresholds in ascending order.
+ *
+ * Threshold values are 64-bit (they are parsed into a u64 when an event is
+ * registered), so the difference must not be returned directly: narrowing
+ * it to int can produce 0 or the wrong sign when two thresholds differ by
+ * 2^31 or more, which would leave the array mis-sorted.
+ *
+ * Returns 1, -1 or 0 when @a is greater than, less than or equal to @b.
+ */
+static int compare_thresholds(const void *a, const void *b)
+{
+ const struct mem_cgroup_threshold *_a = a;
+ const struct mem_cgroup_threshold *_b = b;
+
+ if (_a->threshold > _b->threshold)
+ return 1;
+ if (_a->threshold < _b->threshold)
+ return -1;
+ return 0;
+}
+
+/*
+ * Signal every eventfd on @mem's oom_notify list.  Always returns 0.
+ */
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
+{
+ struct mem_cgroup_eventfd_list *ev;
+
+ list_for_each_entry(ev, &mem->oom_notify, list)
+ eventfd_signal(ev->eventfd, 1);
+ return 0;
+}
+
+/*
+ * Deliver the OOM notification to every group visited by
+ * for_each_mem_cgroup_tree() starting from @mem.
+ */
+static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ mem_cgroup_oom_notify_cb(iter);
+}
+
+/*
+ * Register @eventfd to be signalled when usage crosses the threshold given
+ * in @args.  The cftype's type selects the memory (_MEM) or memory+swap
+ * (_MEMSWAP) threshold set; any other type is a BUG().
+ *
+ * A new array one entry larger than the current one is built, sorted, and
+ * published with rcu_assign_pointer(); the previous primary array is kept
+ * as the spare buffer for use by the unregister path.
+ *
+ * Returns 0 on success, the error from parsing @args, or -ENOMEM.
+ */
+static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ int type = MEMFILE_TYPE(cft->private);
+ u64 threshold, usage;
+ int i, size, ret;
+
+ ret = res_counter_memparse_write_strategy(args, &threshold);
+ if (ret)
+ return ret;
+
+ /* thresholds_lock serialises all updates of primary/spare */
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM)
+ thresholds = &memcg->thresholds;
+ else if (type == _MEMSWAP)
+ thresholds = &memcg->memsw_thresholds;
+ else
+ BUG();
+
+ usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
+
+ /* Check if a threshold crossed before adding a new one */
+ if (thresholds->primary)
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+ /* Allocate memory for new array of thresholds */
+ new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
+ GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ new->size = size;
+
+ /* Copy thresholds (if any) to new array */
+ if (thresholds->primary) {
+ memcpy(new->entries, thresholds->primary->entries, (size - 1) *
+ sizeof(struct mem_cgroup_threshold));
+ }
+
+ /* Add new threshold */
+ new->entries[size - 1].eventfd = eventfd;
+ new->entries[size - 1].threshold = threshold;
+
+ /* Sort thresholds. Registering of new threshold isn't time-critical */
+ sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+ compare_thresholds, NULL);
+
+ /*
+ * Find current threshold: after the loop current_threshold is the
+ * number of entries strictly below usage, minus one (-1 if none).
+ */
+ new->current_threshold = -1;
+ for (i = 0; i < size; i++) {
+ if (new->entries[i].threshold < usage) {
+ /*
+ * new->current_threshold will not be used until
+ * rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ }
+ }
+
+ /* Free old spare buffer and save old primary buffer as spare */
+ kfree(thresholds->spare);
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+
+ return ret;
+}
+
+/*
+ * Remove every threshold registered with @eventfd from the memory (_MEM)
+ * or memory+swap (_MEMSWAP) threshold set of @cgrp, as selected by the
+ * cftype's type; any other type is a BUG().
+ *
+ * The surviving entries are copied into the spare buffer, which is then
+ * published as the new primary array with rcu_assign_pointer(); the old
+ * primary becomes the spare.  If no entry survives, the spare is freed and
+ * the primary is set to NULL.
+ */
+static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ int type = MEMFILE_TYPE(cft->private);
+ u64 usage;
+ int i, j, size;
+
+ mutex_lock(&memcg->thresholds_lock);
+ if (type == _MEM)
+ thresholds = &memcg->thresholds;
+ else if (type == _MEMSWAP)
+ thresholds = &memcg->memsw_thresholds;
+ else
+ BUG();
+
+ /*
+ * Nothing to do if there is no threshold array.  All entries of an
+ * eventfd are dropped in one go below, so a repeated unregister for
+ * the same eventfd can find the array already gone, and the counting
+ * loop would then dereference a NULL primary.  (The former
+ * BUG_ON(!thresholds) could never trigger: thresholds is the address
+ * of a member of memcg.)
+ */
+ if (!thresholds->primary)
+ goto unlock;
+
+ usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
+
+ /* Check if a threshold crossed before removing */
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ /* Calculate new number of threshold */
+ size = 0;
+ for (i = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd != eventfd)
+ size++;
+ }
+
+ /*
+ * NOTE(review): this assumes the spare buffer exists and can hold
+ * @size entries, i.e. that at least one entry matches @eventfd --
+ * confirm against the callers.
+ */
+ new = thresholds->spare;
+
+ /* Set thresholds array to NULL if we don't have thresholds */
+ if (!size) {
+ kfree(new);
+ new = NULL;
+ goto swap_buffers;
+ }
+
+ new->size = size;
+
+ /* Copy thresholds and find current threshold */
+ new->current_threshold = -1;
+ for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd == eventfd)
+ continue;
+
+ new->entries[j] = thresholds->primary->entries[i];
+ if (new->entries[j].threshold < usage) {
+ /*
+ * new->current_threshold will not be used
+ * until rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ }
+ j++;
+ }
+
+swap_buffers:
+ /* Swap primary and spare array */
+ thresholds->spare = thresholds->primary;
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+}
+
+/*
+ * Add @eventfd to the OOM notification list of @cgrp.  The cftype must be
+ * of type _OOM_TYPE.  @args is not used.
+ *
+ * If the group's oom_lock counter is already non-zero, the eventfd is
+ * signalled immediately so the new listener does not miss the current OOM.
+ *
+ * Returns 0 on success or -ENOMEM if the list node cannot be allocated.
+ */
+static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup_eventfd_list *event;
+ int type = MEMFILE_TYPE(cft->private);
+
+ BUG_ON(type != _OOM_TYPE);
+ /* allocate before taking the mutex */
+ event = kmalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ mutex_lock(&memcg_oom_mutex);
+
+ event->eventfd = eventfd;
+ list_add(&event->list, &memcg->oom_notify);
+
+ /* already in OOM ? */
+ if (atomic_read(&memcg->oom_lock))
+ eventfd_signal(eventfd, 1);
+ mutex_unlock(&memcg_oom_mutex);
+
+ return 0;
+}
+
+/*
+ * Remove and free every entry for @eventfd on the OOM notification list of
+ * @cgrp, under memcg_oom_mutex.  The cftype must be of type _OOM_TYPE.
+ */
+static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup_eventfd_list *ev, *tmp;
+ int type = MEMFILE_TYPE(cft->private);
+
+ BUG_ON(type != _OOM_TYPE);
+
+ mutex_lock(&memcg_oom_mutex);
+
+ /* _safe variant: entries are deleted while walking the list */
+ list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
+ if (ev->eventfd == eventfd) {
+ list_del(&ev->list);
+ kfree(ev);
+ }
+ }
+
+ mutex_unlock(&memcg_oom_mutex);
+}
+
+/*
+ * Report the oom_control state of @cgrp through @cb: the oom_kill_disable
+ * flag, and "under_oom" as 1 when the oom_lock counter is non-zero, else 0.
+ * Always returns 0.
+ */
+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+ struct cftype *cft, struct cgroup_map_cb *cb)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+ cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+ cb->fill(cb, "under_oom", atomic_read(&mem->oom_lock) ? 1 : 0);
+
+ return 0;
+}
+
+/*
+ * Set the oom_kill_disable flag of @cgrp to @val (0 or 1).
+ *
+ * Returns -EINVAL for the root cgroup, for any other value, and when the
+ * group is part of a hierarchy (its parent uses hierarchy, or it uses
+ * hierarchy itself and already has children).  Returns 0 on success.
+ * Clearing the flag calls memcg_oom_recover() on the group.
+ */
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *parent;
+
+ /* cannot set to root cgroup and only 0 and 1 are allowed */
+ if (!cgrp->parent || !((val == 0) || (val == 1)))
+ return -EINVAL;
+
+ parent = mem_cgroup_from_cont(cgrp->parent);
+
+ /* the hierarchy checks and the update are done under cgroup_lock() */
+ cgroup_lock();
+ /* oom-kill-disable is a flag for subhierarchy. */
+ if ((parent->use_hierarchy) ||
+ (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+ cgroup_unlock();
+ return -EINVAL;
+ }
+ mem->oom_kill_disable = val;
+ if (!val)
+ memcg_oom_recover(mem);
+ cgroup_unlock();
+ return 0;
+}
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read,
+ .register_event = mem_cgroup_usage_register_event,
+ .unregister_event = mem_cgroup_usage_unregister_event,
},
{
.name = "max_usage_in_bytes",
@@ -3098,6 +4141,19 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
+ {
+ .name = "move_charge_at_immigrate",
+ .read_u64 = mem_cgroup_move_charge_read,
+ .write_u64 = mem_cgroup_move_charge_write,
+ },
+ {
+ .name = "oom_control",
+ .read_map = mem_cgroup_oom_control_read,
+ .write_u64 = mem_cgroup_oom_control_write,
+ .register_event = mem_cgroup_oom_register_event,
+ .unregister_event = mem_cgroup_oom_unregister_event,
+ .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
+ },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3106,6 +4162,8 @@ static struct cftype memsw_cgroup_files[] = {
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
.read_u64 = mem_cgroup_read,
+ .register_event = mem_cgroup_usage_register_event,
+ .unregister_event = mem_cgroup_usage_unregister_event,
},
{
.name = "memsw.max_usage_in_bytes",
@@ -3157,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
@@ -3180,25 +4236,32 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
kfree(mem->info.nodeinfo[node]);
}
-static int mem_cgroup_size(void)
-{
- int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
- return sizeof(struct mem_cgroup) + cpustat_size;
-}
-
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
- int size = mem_cgroup_size();
+ int size = sizeof(struct mem_cgroup);
+ /* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kmalloc(size, GFP_KERNEL);
+ mem = kzalloc(size, GFP_KERNEL);
else
- mem = vmalloc(size);
+ mem = vzalloc(size);
- if (mem)
- memset(mem, 0, size);
+ if (!mem)
+ return NULL;
+
+ mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!mem->stat)
+ goto out_free;
+ spin_lock_init(&mem->pcp_counter_lock);
return mem;
+
+out_free:
+ if (size < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+ return NULL;
}
/*
@@ -3222,7 +4285,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
- if (mem_cgroup_size() < PAGE_SIZE)
+ free_percpu(mem->stat);
+ if (sizeof(struct mem_cgroup) < PAGE_SIZE)
kfree(mem);
else
vfree(mem);
@@ -3233,9 +4297,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
atomic_inc(&mem->refcnt);
}
-static void mem_cgroup_put(struct mem_cgroup *mem)
+static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
{
- if (atomic_dec_and_test(&mem->refcnt)) {
+ if (atomic_sub_and_test(count, &mem->refcnt)) {
struct mem_cgroup *parent = parent_mem_cgroup(mem);
__mem_cgroup_free(mem);
if (parent)
@@ -3243,6 +4307,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem)
}
}
+/* Drop a single reference on @mem; wrapper around __mem_cgroup_put(). */
+static void mem_cgroup_put(struct mem_cgroup *mem)
+{
+ __mem_cgroup_put(mem, 1);
+}
+
/*
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
*/
@@ -3318,11 +4387,11 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
- hotcpu_notifier(memcg_stock_cpu_callback, 0);
-
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
mem->use_hierarchy = parent->use_hierarchy;
+ mem->oom_kill_disable = parent->oom_kill_disable;
}
if (parent && parent->use_hierarchy) {
@@ -3341,10 +4410,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
}
mem->last_scanned_child = 0;
spin_lock_init(&mem->reclaim_param_lock);
+ INIT_LIST_HEAD(&mem->oom_notify);
if (parent)
mem->swappiness = get_swappiness(parent);
atomic_set(&mem->refcnt, 1);
+ mem->move_charge_at_immigrate = 0;
+ mutex_init(&mem->thresholds_lock);
return &mem->css;
free_out:
__mem_cgroup_free(mem);
@@ -3381,17 +4453,517 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
return ret;
}
+#ifdef CONFIG_MMU
+/* Handlers for move charge at task migration. */
+#define PRECHARGE_COUNT_AT_ONCE 256
+/*
+ * Charge @count pages to the move destination (mc.to) in advance and add
+ * them to mc.precharge.
+ *
+ * For the root group nothing is charged; the pages are only counted.
+ * Otherwise a single bulk res_counter charge is tried first (when
+ * @count > 1); if that fails the pages are charged one at a time through
+ * __mem_cgroup_try_charge().
+ *
+ * Returns 0 on success, -EINTR if a signal is pending, or -ENOMEM if a
+ * single-page charge fails.  Charges already accounted in mc.precharge are
+ * left for mem_cgroup_clear_mc() to undo.
+ */
+static int mem_cgroup_do_precharge(unsigned long count)
+{
+ int ret = 0;
+ int batch_count = PRECHARGE_COUNT_AT_ONCE;
+ struct mem_cgroup *mem = mc.to;
+
+ if (mem_cgroup_is_root(mem)) {
+ mc.precharge += count;
+ /* we don't need css_get for root */
+ return ret;
+ }
+ /* try to charge at once */
+ if (count > 1) {
+ struct res_counter *dummy;
+ /*
+ * "mem" cannot be under rmdir() because we've already checked
+ * by cgroup_lock_live_cgroup() that it is not removed and we
+ * are still under the same cgroup_mutex. So we can postpone
+ * css_get().
+ */
+ if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
+ goto one_by_one;
+ if (do_swap_account && res_counter_charge(&mem->memsw,
+ PAGE_SIZE * count, &dummy)) {
+ /* undo the ->res charge before falling back */
+ res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+ goto one_by_one;
+ }
+ mc.precharge += count;
+ return ret;
+ }
+one_by_one:
+ /* fall back to one by one charge */
+ while (count--) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ /* yield the CPU once every PRECHARGE_COUNT_AT_ONCE pages */
+ if (!batch_count--) {
+ batch_count = PRECHARGE_COUNT_AT_ONCE;
+ cond_resched();
+ }
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+ PAGE_SIZE);
+ if (ret || !mem)
+ /* mem_cgroup_clear_mc() will do uncharge later */
+ return -ENOMEM;
+ mc.precharge++;
+ }
+ return ret;
+}
+
+/**
+ * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ * move charge. if @target is not NULL, the page is stored in target->page
+ * with extra refcnt got(Callers should handle it).
+ * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. if @target is not NULL, the entry is stored
+ * in target->ent.
+ *
+ * Called with pte lock held.
+ */
+/* What is_target_pte_for_mc() hands back: a page or a swap entry. */
+union mc_target {
+ struct page *page;
+ swp_entry_t ent;
+};
+
+/* Return values of is_target_pte_for_mc(); they say which member is valid. */
+enum mc_target_type {
+ MC_TARGET_NONE, /* not used */
+ MC_TARGET_PAGE,
+ MC_TARGET_SWAP,
+};
+
+/*
+ * Look up the page behind a present pte and decide whether it is a
+ * candidate for moving its charge.
+ *
+ * Returns the page with an extra reference taken, or NULL when there is no
+ * normal mapped page, when moving this kind of page (anon/file) is not
+ * enabled, when an anon page exceeds the mapcount limit below, or when the
+ * reference cannot be taken.
+ */
+static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent)
+{
+ struct page *page = vm_normal_page(vma, addr, ptent);
+
+ if (!page || !page_mapped(page))
+ return NULL;
+ if (PageAnon(page)) {
+ /* we don't move shared anon */
+ /*
+ * NOTE(review): "> 2" lets an anon page with a mapcount of 2
+ * through although the comment says shared anon is skipped --
+ * confirm the intended limit.
+ */
+ if (!move_anon() || page_mapcount(page) > 2)
+ return NULL;
+ } else if (!move_file())
+ /* we ignore mapcount for file pages */
+ return NULL;
+ if (!get_page_unless_zero(page))
+ return NULL;
+
+ return page;
+}
+
+/*
+ * Handle a swap pte for move charge.
+ *
+ * Returns the page that mem_cgroup_count_swap_user() stored for the entry
+ * (possibly NULL).  When swap accounting is enabled the swap entry is also
+ * stored in @entry.  Returns NULL without touching @entry when moving anon
+ * is disabled, the entry is not a real swap entry, or the entry has more
+ * than one user.
+ */
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ int usage_count;
+ struct page *page = NULL;
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!move_anon() || non_swap_entry(ent))
+ return NULL;
+ usage_count = mem_cgroup_count_swap_user(ent, &page);
+ if (usage_count > 1) { /* we don't move shared anon */
+ /* drop the reference that came with the page */
+ if (page)
+ put_page(page);
+ return NULL;
+ }
+ if (do_swap_account)
+ entry->val = ent.val;
+
+ return page;
+}
+
+/*
+ * Handle an empty (pte_none) or nonlinear-file pte of a file-backed vma
+ * for move charge.
+ *
+ * Returns the page found in the file's mapping at the corresponding offset
+ * (possibly NULL).  For swap-backed mappings the lookup goes through
+ * mem_cgroup_get_shmem_target(), and the swap entry it reports is stored in
+ * @entry when swap accounting is enabled.  Returns NULL for anonymous vmas
+ * and when moving file pages is disabled.
+ */
+static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ struct page *page = NULL;
+ struct inode *inode;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+
+ if (!vma->vm_file) /* anonymous vma */
+ return NULL;
+ if (!move_file())
+ return NULL;
+
+ inode = vma->vm_file->f_path.dentry->d_inode;
+ mapping = vma->vm_file->f_mapping;
+ if (pte_none(ptent))
+ pgoff = linear_page_index(vma, addr);
+ else /* pte_file(ptent) is true */
+ pgoff = pte_to_pgoff(ptent);
+
+ /* page is moved even if it's not RSS of this task(page-faulted). */
+ if (!mapping_cap_swap_backed(mapping)) { /* normal file */
+ page = find_get_page(mapping, pgoff);
+ } else { /* shmem/tmpfs file. we should take account of swap too. */
+ /*
+ * NOTE(review): ent is not initialised here; this relies on
+ * mem_cgroup_get_shmem_target() always setting it -- confirm.
+ */
+ swp_entry_t ent;
+ mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
+ if (do_swap_account)
+ entry->val = ent.val;
+ }
+
+ return page;
+}
+
+/*
+ * Classify @ptent for move charge; the contract (return values, @target,
+ * locking) is described in the kernel-doc comment placed above
+ * union mc_target.
+ *
+ * The pte is dispatched to one of the mc_handle_*_pte() helpers, which may
+ * yield a page (with a reference held), a swap entry, or both.  A page
+ * wins over a swap entry when it is charged to mc.from.
+ */
+static int is_target_pte_for_mc(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, union mc_target *target)
+{
+ struct page *page = NULL;
+ struct page_cgroup *pc;
+ int ret = 0;
+ swp_entry_t ent = { .val = 0 };
+
+ if (pte_present(ptent))
+ page = mc_handle_present_pte(vma, addr, ptent);
+ else if (is_swap_pte(ptent))
+ page = mc_handle_swap_pte(vma, addr, ptent, &ent);
+ else if (pte_none(ptent) || pte_file(ptent))
+ page = mc_handle_file_pte(vma, addr, ptent, &ent);
+
+ if (!page && !ent.val)
+ return 0;
+ if (page) {
+ pc = lookup_page_cgroup(page);
+ /*
+ * Do only loose check w/o page_cgroup lock.
+ * mem_cgroup_move_account() checks the pc is valid or not under
+ * the lock.
+ */
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
+ if (target)
+ target->page = page;
+ }
+ /*
+ * Keep the page reference only when it is handed to the
+ * caller through @target.
+ */
+ if (!ret || !target)
+ put_page(page);
+ }
+ /* There is a swap entry and a page doesn't exist or isn't charged */
+ if (ent.val && !ret &&
+ css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
+ ret = MC_TARGET_SWAP;
+ if (target)
+ target->ent = ent;
+ }
+ return ret;
+}
+
+/*
+ * pmd_entry callback for the precharge-counting page walk: bump
+ * mc.precharge once for every pte in [@addr, @end) that
+ * is_target_pte_for_mc() reports as a move target.  The vma is passed in
+ * walk->private.  Always returns 0.
+ */
+static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->private;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+ if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+ mc.precharge++; /* increment precharge temporarily */
+ /* pte was advanced one past the last entry visited by the loop */
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ return 0;
+}
+
+/*
+ * Count the ptes of @mm that are move-charge targets.
+ *
+ * Walks every non-hugetlb vma with mmap_sem held for read.  mc.precharge
+ * is used as a scratch counter by the walk callback and is reset to 0
+ * before the count is returned.
+ */
+static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
+{
+ unsigned long precharge;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ struct mm_walk mem_cgroup_count_precharge_walk = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .mm = mm,
+ .private = vma,
+ };
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &mem_cgroup_count_precharge_walk);
+ }
+ up_read(&mm->mmap_sem);
+
+ precharge = mc.precharge;
+ mc.precharge = 0;
+
+ return precharge;
+}
+
+/*
+ * Count the move targets in @mm, mark the current task as the one doing
+ * the move (mc.moving_task), and precharge that many pages to mc.to.
+ * Returns the result of mem_cgroup_do_precharge().
+ */
+static int mem_cgroup_precharge_mc(struct mm_struct *mm)
+{
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
+}
+
+/*
+ * cancels all extra charges on mc.from and mc.to, and wakes up all waiters.
+ *
+ * Three counters are settled and reset to 0: unused precharges on mc.to,
+ * charges that were moved but not yet uncharged from mc.from, and moved
+ * swap entries.  mc.from and mc.to themselves are left set; the caller
+ * clears them.
+ */
+static void __mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
+ /* we must uncharge all the leftover precharges from mc.to */
+ if (mc.precharge) {
+ __mem_cgroup_cancel_charge(mc.to, mc.precharge);
+ mc.precharge = 0;
+ }
+ /*
+ * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
+ * we must uncharge here.
+ */
+ if (mc.moved_charge) {
+ __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+ mc.moved_charge = 0;
+ }
+ /* we must fixup refcnts and charges */
+ if (mc.moved_swap) {
+ /* uncharge swap account from the old cgroup */
+ if (!mem_cgroup_is_root(mc.from))
+ res_counter_uncharge(&mc.from->memsw,
+ PAGE_SIZE * mc.moved_swap);
+ /* drop one reference on mc.from per moved swap entry */
+ __mem_cgroup_put(mc.from, mc.moved_swap);
+
+ if (!mem_cgroup_is_root(mc.to)) {
+ /*
+ * we charged both to->res and to->memsw, so we should
+ * uncharge to->res.
+ */
+ res_counter_uncharge(&mc.to->res,
+ PAGE_SIZE * mc.moved_swap);
+ }
+ /* we've already done mem_cgroup_get(mc.to) */
+ mc.moved_swap = 0;
+ }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+/*
+ * Finish (or abort) a charge move: settle all outstanding charges, reset
+ * mc.from/mc.to under mc.lock, and end the move on the source group.
+ */
+static void mem_cgroup_clear_mc(void)
+{
+ /* remember the source: mc.from is cleared before mem_cgroup_end_move() */
+ struct mem_cgroup *from = mc.from;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
+ spin_lock(&mc.lock);
+ mc.from = NULL;
+ mc.to = NULL;
+ spin_unlock(&mc.lock);
+ mem_cgroup_end_move(from);
+}
+
+/*
+ * can_attach callback: prepare to move the charges of task @p to @cgroup.
+ *
+ * Only acts when the destination group has move_charge_at_immigrate set
+ * and @p is the owner of its mm.  In that case the source and destination
+ * groups are recorded in mc and the destination is precharged.
+ *
+ * Returns 0 when nothing needs to be done or the precharge succeeded;
+ * otherwise the error from mem_cgroup_precharge_mc(), after the move
+ * state has been cleared again.
+ */
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *cgroup,
+ struct task_struct *p,
+ bool threadgroup)
+{
+ int ret = 0;
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
+
+ if (mem->move_charge_at_immigrate) {
+ struct mm_struct *mm;
+ struct mem_cgroup *from = mem_cgroup_from_task(p);
+
+ VM_BUG_ON(from == mem);
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return 0;
+ /* We move charges only when we move a owner of the mm */
+ if (mm->owner == p) {
+ /* no other move may be in progress */
+ VM_BUG_ON(mc.from);
+ VM_BUG_ON(mc.to);
+ VM_BUG_ON(mc.precharge);
+ VM_BUG_ON(mc.moved_charge);
+ VM_BUG_ON(mc.moved_swap);
+ mem_cgroup_start_move(from);
+ spin_lock(&mc.lock);
+ mc.from = from;
+ mc.to = mem;
+ spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
+
+ ret = mem_cgroup_precharge_mc(mm);
+ if (ret)
+ mem_cgroup_clear_mc();
+ }
+ mmput(mm);
+ }
+ return ret;
+}
+
+/*
+ * cancel_attach callback: undo whatever mem_cgroup_can_attach() set up by
+ * clearing the move-charge state.
+ */
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+ struct cgroup *cgroup,
+ struct task_struct *p,
+ bool threadgroup)
+{
+ mem_cgroup_clear_mc();
+}
+
+/*
+ * pmd_entry callback for the move-charge page walk: for every pte in
+ * [@addr, @end) that is a move target, move the charge of its page or swap
+ * entry from mc.from to mc.to, consuming one precharge each time.
+ *
+ * When the precharges run out before @end is reached, the pte lock is
+ * dropped, one more page is precharged and the scan resumes at the same
+ * address.  Returns 0, or the error from mem_cgroup_do_precharge() if that
+ * additional charge fails.
+ */
+static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ int ret = 0;
+ struct vm_area_struct *vma = walk->private;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+retry:
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; addr += PAGE_SIZE) {
+ pte_t ptent = *(pte++);
+ union mc_target target;
+ int type;
+ struct page *page;
+ struct page_cgroup *pc;
+ swp_entry_t ent;
+
+ /* out of precharges: leave the loop with addr != end */
+ if (!mc.precharge)
+ break;
+
+ type = is_target_pte_for_mc(vma, addr, ptent, &target);
+ switch (type) {
+ case MC_TARGET_PAGE:
+ page = target.page;
+ if (isolate_lru_page(page))
+ goto put;
+ pc = lookup_page_cgroup(page);
+ if (!mem_cgroup_move_account(pc,
+ mc.from, mc.to, false)) {
+ mc.precharge--;
+ /* we uncharge from mc.from later. */
+ mc.moved_charge++;
+ }
+ putback_lru_page(page);
+put: /* is_target_pte_for_mc() gets the page */
+ put_page(page);
+ break;
+ case MC_TARGET_SWAP:
+ ent = target.ent;
+ if (!mem_cgroup_move_swap_account(ent,
+ mc.from, mc.to, false)) {
+ mc.precharge--;
+ /* we fixup refcnts and charges later. */
+ mc.moved_swap++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ /* pte was post-incremented past the last entry that was read */
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ if (addr != end) {
+ /*
+ * We have consumed all precharges we got in can_attach().
+ * We try charge one by one, but don't do any additional
+ * charges to mc.to if we have failed in charge once in attach()
+ * phase.
+ */
+ ret = mem_cgroup_do_precharge(1);
+ if (!ret)
+ goto retry;
+ }
+
+ return ret;
+}
+
+/*
+ * Move the charges of every non-hugetlb vma of @mm from mc.from to mc.to.
+ *
+ * mmap_sem is only taken with trylock; on contention the extra charges are
+ * cancelled and waiters are woken before retrying (see the comment below).
+ * The walk stops at the first vma for which the page walk returns an error.
+ */
+static void mem_cgroup_move_charge(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ lru_add_drain_all();
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+ * Someone who are holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ int ret;
+ struct mm_walk mem_cgroup_move_charge_walk = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+ .mm = mm,
+ .private = vma,
+ };
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ ret = walk_page_range(vma->vm_start, vma->vm_end,
+ &mem_cgroup_move_charge_walk);
+ if (ret)
+ /*
+ * means we have consumed all precharges and failed in
+ * doing additional charge. Just abandon here.
+ */
+ break;
+ }
+ up_read(&mm->mmap_sem);
+}
+
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
struct task_struct *p,
bool threadgroup)
{
- /*
- * FIXME: It's better to move charges of this process from old
- * memcg to new memcg. But it's just on TODO-List now.
- */
+ struct mm_struct *mm;
+
+ if (!mc.to)
+ /* no need to move charge */
+ return;
+
+ mm = get_task_mm(p);
+ if (mm) {
+ mem_cgroup_move_charge(mm);
+ mmput(mm);
+ }
+ mem_cgroup_clear_mc();
}
+#else /* !CONFIG_MMU */
+/* !CONFIG_MMU: charges are never moved, so attaching is always allowed. */
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *cgroup,
+ struct task_struct *p,
+ bool threadgroup)
+{
+ return 0;
+}
+}
+/* !CONFIG_MMU: nothing was prepared in can_attach, so nothing to undo. */
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+ struct cgroup *cgroup,
+ struct task_struct *p,
+ bool threadgroup)
+{
+}
+/* !CONFIG_MMU: attach callback is a no-op; no charges are moved. */
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p,
+ bool threadgroup)
+{
+}
+#endif
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
@@ -3400,16 +4972,28 @@ struct cgroup_subsys mem_cgroup_subsys = {
.pre_destroy = mem_cgroup_pre_destroy,
.destroy = mem_cgroup_destroy,
.populate = mem_cgroup_populate,
+ .can_attach = mem_cgroup_can_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.early_init = 0,
.use_id = 1,
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * Parse the "swapaccount" boot parameter.
+ *
+ * __setup() matches on the "swapaccount" prefix and passes the handler the
+ * remainder of the option, so "swapaccount=1" arrives as "=1" and a bare
+ * "swapaccount" as an empty string.  Skip the optional '=' so that both
+ * forms are recognised; a plain "0"/"1" is still accepted because
+ * disable_swap_account() calls this function with "0" directly.
+ *
+ * Unrecognised values leave really_do_swap_account untouched.
+ * Always returns 1.
+ */
+static int __init enable_swap_account(char *s)
+{
+ if (s && *s == '=')
+ s++;
+ /* consider enabled if no parameter or 1 is given */
+ if (!s || !*s || !strcmp(s, "1"))
+ really_do_swap_account = 1;
+ else if (!strcmp(s, "0"))
+ really_do_swap_account = 0;
+ return 1;
+}
+__setup("swapaccount", enable_swap_account);
static int __init disable_swap_account(char *s)
{
- really_do_swap_account = 0;
+ enable_swap_account("0");
return 1;
}
__setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17299fd4577c..548fbd70f026 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
+ *
+ * In addition there is a "soft offline" entry point that allows the kernel to
+ * stop using not-yet-corrupted-but-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non-linear complexity in the number of mappings. But since memory
+ * corruptions are rare we hope to get away with this. This avoids impacting
+ * the core VM.
*/
/*
@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
-#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -44,6 +48,10 @@
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -75,7 +83,7 @@ static int hwpoison_filter_dev(struct page *p)
return 0;
/*
- * page_mapping() does not accept slab page
+ * page_mapping() does not accept slab pages.
*/
if (PageSlab(p))
return -EINVAL;
@@ -180,7 +188,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
* signal.
*/
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn)
+ unsigned long pfn, struct page *page)
{
struct siginfo si;
int ret;
@@ -195,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = PAGE_SHIFT;
+ si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
@@ -232,7 +240,7 @@ void shake_page(struct page *p, int access)
int nr;
do {
nr = shrink_slab(1000, GFP_KERNEL, 1000);
- if (page_count(p) == 0)
+ if (page_count(p) == 1)
break;
} while (nr > 10);
}
@@ -265,7 +273,7 @@ struct to_kill {
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- unsigned addr_valid:1;
+ char addr_valid;
};
/*
@@ -306,7 +314,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -324,7 +332,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* wrong earlier.
*/
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, unsigned long pfn)
+ int fail, struct page *page, unsigned long pfn)
{
struct to_kill *tk, *next;
@@ -349,7 +357,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
* process anyways.
*/
else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn) < 0)
+ pfn, page) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
@@ -378,14 +386,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct task_struct *tsk;
struct anon_vma *av;
+ if (!PageHuge(page) && unlikely(split_huge_page(page)))
+ return;
read_lock(&tasklist_lock);
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
goto out;
for_each_process (tsk) {
+ struct anon_vma_chain *vmac;
+
if (!task_early_kill(tsk))
continue;
- list_for_each_entry (vma, &av->head, anon_vma_node) {
+ list_for_each_entry(vmac, &av->head, same_anon_vma) {
+ vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == tsk->mm)
@@ -571,7 +584,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
ret = RECOVERED;
}
@@ -685,17 +698,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
/*
* Huge pages. Needs work.
* Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ * To narrow down kill region to one page, we need to break up pmd.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
- return FAILED;
+ int res = 0;
+ struct page *hpage = compound_head(p);
+ /*
+ * We can safely recover from error on free or reserved (i.e.
+ * not in-use) hugepage by dequeuing it from freelist.
+ * To check whether a hugepage is in-use or not, we can't use
+ * page->lru because it can be used in other hugepage operations,
+ * such as __unmap_hugepage_range() and gather_surplus_pages().
+ * So instead we use page_mapping() and PageAnon().
+ * We assume that this function is called with page lock held,
+ * so there is no race between isolation and mapping/unmapping.
+ */
+ if (!(page_mapping(hpage) || PageAnon(hpage))) {
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ if (!res)
+ return RECOVERED;
+ }
+ return DELAYED;
}
/*
@@ -818,8 +843,6 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
-#define N_UNMAP_TRIES 5
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -831,8 +854,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int i;
int kill = 1;
+ struct page *hpage = compound_head(p);
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
@@ -841,10 +864,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
- if (!page_mapped(p))
+ if (!page_mapped(hpage))
return SWAP_SUCCESS;
- if (PageCompound(p) || PageKsm(p))
+ if (PageKsm(p))
return SWAP_FAIL;
if (PageSwapCache(p)) {
@@ -859,10 +882,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
- mapping = page_mapping(p);
- if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
- if (page_mkclean(p)) {
- SetPageDirty(p);
+ mapping = page_mapping(hpage);
+ if (!PageDirty(hpage) && mapping &&
+ mapping_cap_writeback_dirty(mapping)) {
+ if (page_mkclean(hpage)) {
+ SetPageDirty(hpage);
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
@@ -881,22 +905,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(p, &tokill);
-
- /*
- * try_to_unmap can fail temporarily due to races.
- * Try a few times (RED-PEN better strategy?)
- */
- for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(p, ttu);
- if (ret == SWAP_SUCCESS)
- break;
- pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
- }
+ collect_procs(hpage, &tokill);
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(p));
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -907,17 +921,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(p), trapno,
- ret != SWAP_SUCCESS, pfn);
+ kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+ ret != SWAP_SUCCESS, p, pfn);
return ret;
}
+/*
+ * Set PG_hwpoison on each of the 1 << compound_trans_order(hpage) struct
+ * pages starting at @hpage.  Callers in this file pass the head page of
+ * the affected hugepage, so the whole hugepage ends up marked.
+ */
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_trans_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ SetPageHWPoison(hpage + i);
+}
+
+/*
+ * Clear PG_hwpoison on each of the 1 << compound_trans_order(hpage) struct
+ * pages starting at @hpage; the inverse of set_page_hwpoison_huge_page().
+ */
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_trans_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ ClearPageHWPoison(hpage + i);
+}
+
int __memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
+ struct page *hpage;
int res;
+ unsigned int nr_pages;
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -930,18 +962,23 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
- atomic_long_add(1, &mce_bad_pages);
+ nr_pages = 1 << compound_trans_order(hpage);
+ atomic_long_add(nr_pages, &mce_bad_pages);
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's part of a non-compound high order page.
+ * 2) it's a free hugepage, which is also safe:
+ * an affected hugepage will be dequeued from hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -949,10 +986,28 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(compound_head(p))) {
+ !get_page_unless_zero(hpage)) {
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "just unpoisoned", "filter hit", and
+ * "race with other subpage."
+ */
+ lock_page_nosync(hpage);
+ if (!PageHWPoison(hpage)
+ || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ return 0;
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
} else {
action_result(pfn, "high order kernel", IGNORED);
return -EBUSY;
@@ -967,9 +1022,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
- if (!PageLRU(p)) {
+ if (!PageLRU(p) && !PageHuge(p)) {
/*
* shake_page could have turned it free.
*/
@@ -987,7 +1042,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(p);
+ lock_page_nosync(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -999,12 +1054,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- unlock_page(p);
- put_page(p);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ unlock_page(hpage);
+ put_page(hpage);
return 0;
}
+ /*
+ * For error on the tail page, we should set PG_hwpoison
+ * on the head page to show that the hugepage is hwpoisoned
+ */
+ if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ action_result(pfn, "hugepage already hardware poisoned",
+ IGNORED);
+ unlock_page(hpage);
+ put_page(hpage);
+ return 0;
+ }
+ /*
+ * Set PG_hwpoison on all pages in an error hugepage,
+ * because containment is done in hugepage unit for now.
+ * Since we have done TestSetPageHWPoison() for the head page with
+ * page lock held, we can safely set PG_hwpoison bits on tail pages.
+ */
+ if (PageHuge(p))
+ set_page_hwpoison_huge_page(hpage);
+
wait_on_page_writeback(p);
/*
@@ -1034,7 +1109,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
}
out:
- unlock_page(p);
+ unlock_page(hpage);
return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1078,6 +1153,7 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
+ unsigned int nr_pages;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1086,14 +1162,26 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0;
}
+ nr_pages = 1 << compound_trans_order(page);
+
if (!get_page_unless_zero(page)) {
+ /*
+ * Since HWPoisoned hugepage should have non-zero refcount,
+ * race between memory failure and unpoison seems to happen.
+ * In such case unpoison fails and memory failure runs
+ * to the end.
+ */
+ if (PageHuge(page)) {
+ pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1104,10 +1192,12 @@ int unpoison_memory(unsigned long pfn)
* the PG_hwpoison page will be caught and isolated on the entrance to
* the free buddy page pool.
*/
- if (TestClearPageHWPoison(p)) {
- pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_dec(&mce_bad_pages);
+ if (TestClearPageHWPoison(page)) {
+ pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
@@ -1122,7 +1212,11 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ nid);
+ else
+ return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1139,25 +1233,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
- * The lock_system_sleep prevents a race with memory hotplug,
- * because the isolation assumes there's only a single user.
+ * The lock_memory_hotplug prevents a race with memory hotplug.
* This is a big hammer, a better would be nicer.
*/
- lock_system_sleep();
+ lock_memory_hotplug();
/*
* Isolate the page, so that it doesn't get reallocated if it
* was free.
*/
set_migratetype_isolate(p);
+ /*
+ * When the target page is a free hugepage, just remove it
+ * from free hugepage list.
+ */
if (!get_page_unless_zero(compound_head(p))) {
- if (is_free_buddy_page(p)) {
- pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+ if (PageHuge(p)) {
+ pr_info("get_any_page: %#lx free huge page\n", pfn);
+ ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+ } else if (is_free_buddy_page(p)) {
+ pr_info("get_any_page: %#lx free buddy page\n", pfn);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p);
ret = 0;
} else {
- pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+ pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
pfn, p->flags);
ret = -EIO;
}
@@ -1166,7 +1266,48 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
ret = 1;
}
unset_migratetype_isolate(p);
- unlock_system_sleep();
+ unlock_memory_hotplug();
+ return ret;
+}
+
+/*
+ * Soft-offline the hugepage that contains @page.
+ *
+ * A reference is first requested through get_any_page().  A negative
+ * result is returned to the caller unchanged; a result of 0 skips the
+ * migration step and goes straight to marking the hugepage.  Otherwise the
+ * hugepage is migrated with migrate_huge_pages(), using new_page() as the
+ * allocator for the destination.
+ *
+ * Returns 0 once the hugepage has been marked hwpoisoned, -EBUSY if the
+ * head page was already poisoned, a negative value from get_any_page() or
+ * migrate_huge_pages(), or -EIO when migrate_huge_pages() returns a
+ * positive value.
+ */
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ LIST_HEAD(pagelist);
+
+ ret = get_any_page(page, pfn, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
+
+ /* Already poisoned: drop the reference on the head page and bail out. */
+ if (PageHWPoison(hpage)) {
+ put_page(hpage);
+ pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ return -EBUSY;
+ }
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+
+ list_add(&hpage->lru, &pagelist);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
+ true);
+ if (ret) {
+ /* Migration failed: hand the list back and report the error. */
+ putback_lru_pages(&pagelist);
+ pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ if (ret > 0)
+ ret = -EIO;
+ return ret;
+ }
+done:
+ /* Account the whole hugepage only if it was not already counted. */
+ if (!PageHWPoison(hpage))
+ atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
/* keep elevated page count for bad page */
return ret;
}
@@ -1197,6 +1338,9 @@ int soft_offline_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
+ if (PageHuge(page))
+ return soft_offline_huge_page(page, flags);
+
ret = get_any_page(page, pfn, flags);
if (ret < 0)
return ret;
@@ -1223,7 +1367,7 @@ int soft_offline_page(struct page *page, int flags)
goto done;
}
if (!PageLRU(page)) {
- pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
}
@@ -1237,7 +1381,7 @@ int soft_offline_page(struct page *page, int flags)
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
- pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1258,7 +1402,7 @@ int soft_offline_page(struct page *page, int flags)
put_page(page);
if (ret == 1) {
ret = 0;
- pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
}
@@ -1272,15 +1416,16 @@ int soft_offline_page(struct page *page, int flags)
LIST_HEAD(pagelist);
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ 0, true);
if (ret) {
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags);
}
if (ret)
@@ -1292,3 +1437,35 @@ done:
/* keep elevated page count for bad page */
return ret;
}
+
+/*
+ * Report whether the PTE that maps @addr in current->mm is a hwpoison
+ * swap entry.
+ *
+ * The page tables of current->mm are walked by hand.  0 is returned if the
+ * pgd, pud or pmd is not present, if the pud or pmd is a large mapping, or
+ * if the PTE is not a swap pte.  Otherwise the result of
+ * is_hwpoison_entry() on the decoded swap entry is returned.
+ *
+ * The caller must hold current->mm->mmap_sem in read mode.
+ */
+int is_hwpoison_address(unsigned long addr)
+{
+ pgd_t *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t pte, *ptep;
+ swp_entry_t entry;
+
+ pgdp = pgd_offset(current->mm, addr);
+ if (!pgd_present(*pgdp))
+ return 0;
+ pudp = pud_offset(pgdp, addr);
+ pud = *pudp;
+ if (!pud_present(pud) || pud_large(pud))
+ return 0;
+ pmdp = pmd_offset(pudp, addr);
+ pmd = *pmdp;
+ if (!pmd_present(pmd) || pmd_large(pmd))
+ return 0;
+ /* Copy the PTE out and unmap before inspecting the copy. */
+ ptep = pte_offset_map(pmdp, addr);
+ pte = *ptep;
+ pte_unmap(ptep);
+ if (!is_swap_pte(pte))
+ return 0;
+ entry = pte_to_swp_entry(pte);
+ return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1be7b67..31250faff390 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
+#include <linux/gfp.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -121,6 +122,77 @@ static int __init init_zero_pfn(void)
}
core_initcall(init_zero_pfn);
+
+#if defined(SPLIT_RSS_COUNTING)
+
+/*
+ * Fold the per-task RSS deltas cached in task->rss_stat into @mm.
+ * Every non-zero task counter is added to the matching mm counter and
+ * then reset; the task's event count is reset as well.
+ */
+static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ if (task->rss_stat.count[i]) {
+ add_mm_counter(mm, i, task->rss_stat.count[i]);
+ task->rss_stat.count[i] = 0;
+ }
+ }
+ task->rss_stat.events = 0;
+}
+
+/*
+ * Add @val to counter @member of @mm.  When @mm is the current task's mm
+ * the delta is accumulated in current's per-task rss_stat cache instead
+ * of going through add_mm_counter(); otherwise the mm counter is updated
+ * directly.
+ */
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+ struct task_struct *task = current;
+
+ if (likely(task->mm == mm))
+ task->rss_stat.count[member] += val;
+ else
+ add_mm_counter(mm, member, val);
+}
+/* Convenience wrappers that adjust a counter by +1 / -1. */
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/*
+ * Sync the cached counters roughly once per 64 page faults.
+ * NOTE(review): the test post-increments events and compares with '>',
+ * so the sync fires on the call that sees events == 65, i.e. the 66th
+ * call after the previous reset.
+ */
+#define TASK_RSS_EVENTS_THRESH (64)
+/*
+ * Count one event for @task and flush its cached RSS deltas into
+ * task->mm once the threshold is exceeded.  Does nothing unless @task is
+ * the current task.
+ */
+static void check_sync_rss_stat(struct task_struct *task)
+{
+ if (unlikely(task != current))
+ return;
+ if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+ __sync_task_rss_stat(task, task->mm);
+}
+
+/*
+ * Read counter @member of @mm.  A negative stored value is reported as 0.
+ */
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+ long val = 0;
+
+ /*
+ * Do not go through task->mm here, so that task_get_mm() is not
+ * needed.  The caller must guarantee that the mm is still valid.
+ */
+ val = atomic_long_read(&mm->rss_stat.count[member]);
+ /*
+ * The counter is updated asynchronously and may be negative at the
+ * moment it is read.  A negative number is never what users expect,
+ * so clamp it to zero.
+ */
+ if (val < 0)
+ return 0;
+ return (unsigned long)val;
+}
+
+/*
+ * Non-static entry point that flushes @task's cached RSS deltas into @mm;
+ * a thin wrapper around __sync_task_rss_stat().
+ */
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+ __sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+/*
+ * Without SPLIT_RSS_COUNTING there is no per-task cache to flush, so this
+ * is an empty stub.
+ */
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+#endif
+
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
@@ -235,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start;
/*
* The next few lines have given us lots of grief...
@@ -279,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
if (addr > end - 1)
return;
- start = addr;
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -300,7 +370,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
- anon_vma_unlink(vma);
+ unlink_anon_vmas(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
@@ -314,7 +384,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
- anon_vma_unlink(vma);
+ unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
@@ -324,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long address)
{
pgtable_t new = pte_alloc_one(mm, address);
+ int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -346,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
spin_lock(&mm->page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ wait_split_huge_page = 0;
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm->nr_ptes++;
pmd_populate(mm, pmd, new);
new = NULL;
- }
+ } else if (unlikely(pmd_trans_splitting(*pmd)))
+ wait_split_huge_page = 1;
spin_unlock(&mm->page_table_lock);
if (new)
pte_free(mm, new);
+ if (wait_split_huge_page)
+ wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -366,22 +442,31 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- }
+ } else
+ VM_BUG_ON(pmd_trans_splitting(*pmd));
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
return 0;
}
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+/*
+ * Zero a local RSS delta vector.  @rss must point to an array of
+ * NR_MM_COUNTERS ints.
+ */
+static inline void init_rss_vec(int *rss)
+{
+ memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
+}
+
+static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
- if (file_rss)
- add_mm_counter(mm, file_rss, file_rss);
- if (anon_rss)
- add_mm_counter(mm, anon_rss, anon_rss);
+ int i;
+
+ if (current->mm == mm)
+ sync_mm_rss(current, mm);
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (rss[i])
+ add_mm_counter(mm, i, rss[i]);
}
/*
@@ -430,12 +515,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
- if (page) {
- printk(KERN_ALERT
- "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
- page, (void *)page->flags, page_count(page),
- page_mapcount(page), page->mapping, page->index);
- }
+ if (page)
+ dump_page(page);
printk(KERN_ALERT
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -597,7 +678,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
- if (is_write_migration_entry(entry) &&
+ if (likely(!non_swap_entry(entry)))
+ rss[MM_SWAPENTS]++;
+ else if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent
@@ -632,7 +715,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (page) {
get_page(page);
page_dup_rmap(page);
- rss[PageAnon(page)]++;
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]++;
+ else
+ rss[MM_FILEPAGES]++;
}
out_set_pte:
@@ -640,23 +726,24 @@ out_set_pte:
return 0;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
{
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
again:
- rss[1] = rss[0] = 0;
+ init_rss_vec(rss);
+
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
- src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
@@ -687,8 +774,8 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
- pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
+ pte_unmap(orig_src_pte);
+ add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -715,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*src_pmd)) {
+ int err;
+ VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+ err = copy_huge_pmd(dst_mm, src_mm,
+ dst_pmd, src_pmd, addr, vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -816,8 +914,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct mm_struct *mm = tlb->mm;
pte_t *pte;
spinlock_t *ptl;
- int file_rss = 0;
- int anon_rss = 0;
+ int rss[NR_MM_COUNTERS];
+
+ init_rss_vec(rss);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -863,14 +962,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
set_pte_at(mm, addr, pte,
pgoff_to_pte(page->index));
if (PageAnon(page))
- anon_rss--;
+ rss[MM_ANONPAGES]--;
else {
if (pte_dirty(ptent))
set_page_dirty(page);
if (pte_young(ptent) &&
likely(!VM_SequentialReadHint(vma)))
mark_page_accessed(page);
- file_rss--;
+ rss[MM_FILEPAGES]--;
}
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
@@ -887,13 +986,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
- } else if
- (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
- print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ if (!non_swap_entry(entry))
+ rss[MM_SWAPENTS]--;
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
- add_mm_rss(mm, file_rss, anon_rss);
+ add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -911,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next-addr != HPAGE_PMD_SIZE) {
+ VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ } else if (zap_huge_pmd(tlb, vma, pmd)) {
+ (*zap_work)--;
+ continue;
+ }
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd)) {
(*zap_work)--;
continue;
@@ -1139,8 +1253,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
-/*
- * Do a quick page-table lookup for a single page.
+/**
+ * follow_page - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
*/
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
@@ -1167,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pud = pud_offset(pgd, address);
if (pud_none(*pud))
goto no_page_table;
- if (pud_huge(*pud)) {
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
goto out;
@@ -1178,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
goto no_page_table;
- if (pmd_huge(*pmd)) {
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if (pmd_trans_huge(*pmd)) {
+ if (flags & FOLL_SPLIT) {
+ split_huge_page_pmd(mm, pmd);
+ goto split_fallthrough;
+ }
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(mm, address,
+ pmd, flags);
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+ } else
+ spin_unlock(&mm->page_table_lock);
+ /* fall through */
+ }
+split_fallthrough:
if (unlikely(pmd_bad(*pmd)))
goto no_page_table;
@@ -1215,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
*/
mark_page_accessed(page);
}
+ if (flags & FOLL_MLOCK) {
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here and migration is
+ * blocked by the pte's page reference, we need
+ * only check for file-cache page truncation.
+ */
+ if (page->mapping)
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
unlock:
pte_unmap_unlock(ptep, ptl);
out:
@@ -1246,7 +1412,8 @@ no_page_table:
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking)
{
int i;
unsigned long vm_flags;
@@ -1291,16 +1458,27 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pmd = pmd_offset(pud, pg);
if (pmd_none(*pmd))
return i ? : -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, pg);
if (pte_none(*pte)) {
pte_unmap(pte);
return i ? : -EFAULT;
}
if (pages) {
- struct page *page = vm_normal_page(gate_vma, start, *pte);
+ struct page *page;
+
+ page = vm_normal_page(gate_vma, start, *pte);
+ if (!page) {
+ if (!(gup_flags & FOLL_DUMP) &&
+ is_zero_pfn(pte_pfn(*pte)))
+ page = pte_page(*pte);
+ else {
+ pte_unmap(pte);
+ return i ? : -EFAULT;
+ }
+ }
pages[i] = page;
- if (page)
- get_page(page);
+ get_page(page);
}
pte_unmap(pte);
if (vmas)
@@ -1336,16 +1514,22 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
+ unsigned int fault_flags = 0;
+
+ if (foll_flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
ret = handle_mm_fault(mm, vma, start,
- (foll_flags & FOLL_WRITE) ?
- FAULT_FLAG_WRITE : 0);
+ fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
if (ret &
- (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
+ (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
+ VM_FAULT_SIGBUS))
return i ? i : -EFAULT;
BUG();
}
@@ -1354,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
else
tsk->min_flt++;
+ if (ret & VM_FAULT_RETRY) {
+ *nonblocking = 0;
+ return i;
+ }
+
/*
* The VM_FAULT_WRITE bit tells us that
* do_wp_page has broken COW when necessary,
@@ -1453,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1478,22 +1668,25 @@ struct page *get_dump_page(unsigned long addr)
struct page *page;
if (__get_user_pages(current, current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
return NULL;
flush_cache_page(vma, addr, page_to_pfn(page));
return page;
}
#endif /* CONFIG_ELF_CORE */
-pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
pgd_t * pgd = pgd_offset(mm, addr);
pud_t * pud = pud_alloc(mm, pgd, addr);
if (pud) {
pmd_t * pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
+ if (pmd) {
+ VM_BUG_ON(pmd_trans_huge(*pmd));
return pte_alloc_map_lock(mm, pmd, addr, ptl);
+ }
}
return NULL;
}
@@ -1527,7 +1720,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter(mm, file_rss);
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -1593,7 +1786,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */
entry = pte_mkspecial(pfn_pte(pfn, prot));
set_pte_at(mm, addr, pte, entry);
- update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
+ update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
retval = 0;
out_unlock:
@@ -1712,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
if (remap_pte_range(mm, pmd, addr, next,
@@ -1901,11 +2095,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr, end = addr + size;
+ unsigned long end = addr + size;
int err;
BUG_ON(addr >= end);
- mmu_notifier_invalidate_range_start(mm, start, end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1913,7 +2106,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- mmu_notifier_invalidate_range_end(mm, start, end);
+
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1943,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
}
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
- * servicing faults for write access. In the normal case, do always want
- * pte_mkwrite. But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
- if (likely(vma->vm_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
- return pte;
-}
-
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
/*
@@ -1975,7 +2155,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
+ clear_page(kaddr);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(dst);
} else
@@ -2003,10 +2183,11 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse = 0, ret = 0;
+ int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
@@ -2043,7 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
page_cache_release(old_page);
}
- reuse = reuse_swap_page(old_page);
+ if (reuse_swap_page(old_page)) {
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ goto reuse;
+ }
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
@@ -2107,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
dirty_page = old_page;
get_page(dirty_page);
- reuse = 1;
- }
- if (reuse) {
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
- goto unlock;
+
+ if (!dirty_page)
+ return ret;
+
+ /*
+ * Yes, Virginia, this is actually required to prevent a race
+ * with clear_page_dirty_for_io() from clearing the page dirty
+ * bit after it clear all dirty ptes, but before a racing
+ * do_wp_page installs a dirty pte.
+ *
+ * do_no_page is protected similarly.
+ */
+ if (!page_mkwrite) {
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ }
+ put_page(dirty_page);
+ if (page_mkwrite) {
+ struct address_space *mapping = dirty_page->mapping;
+
+ set_page_dirty(dirty_page);
+ unlock_page(dirty_page);
+ page_cache_release(dirty_page);
+ if (mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
+ return ret;
}
/*
@@ -2163,11 +2387,11 @@ gotten:
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter(mm, file_rss);
- inc_mm_counter(mm, anon_rss);
+ dec_mm_counter_fast(mm, MM_FILEPAGES);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2185,7 +2409,7 @@ gotten:
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify(mm, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, page_table);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2224,39 +2448,6 @@ gotten:
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
- if (dirty_page) {
- /*
- * Yes, Virginia, this is actually required to prevent a race
- * with clear_page_dirty_for_io() from clearing the page dirty
- * bit after it clear all dirty ptes, but before a racing
- * do_wp_page installs a dirty pte.
- *
- * do_no_page is protected similarly.
- */
- if (!page_mkwrite) {
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
- }
- put_page(dirty_page);
- if (page_mkwrite) {
- struct address_space *mapping = dirty_page->mapping;
-
- set_page_dirty(dirty_page);
- unlock_page(dirty_page);
- page_cache_release(dirty_page);
- if (mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
- }
-
- /* file_update_time outside page_lock */
- if (vma->vm_file)
- file_update_time(vma->vm_file);
- }
return ret;
oom_free_new:
page_cache_release(new_page);
@@ -2512,10 +2703,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
+ int locked;
struct mem_cgroup *ptr = NULL;
+ int exclusive = 0;
int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2564,13 +2757,32 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_release;
}
- lock_page(page);
+ locked = lock_page_or_retry(page, mm, flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ if (!locked) {
+ ret |= VM_FAULT_RETRY;
+ goto out_release;
+ }
- page = ksm_might_need_to_copy(page, vma, address);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
+ /*
+ * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+ * release the swapcache from under us. The page pin, and pte_same
+ * test below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not changed.
+ */
+ if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+ goto out_page;
+
+ if (ksm_might_need_to_copy(page, vma, address)) {
+ swapcache = page;
+ page = ksm_does_need_to_copy(page, vma, address);
+
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ swapcache = NULL;
+ goto out_page;
+ }
}
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2604,15 +2816,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* discarded at swap_free().
*/
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ dec_mm_counter_fast(mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ exclusive = 1;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
- page_add_anon_rmap(page, vma, address);
+ do_page_add_anon_rmap(page, vma, address, exclusive);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
@@ -2620,6 +2835,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
+ if (swapcache) {
+ /*
+ * Hold the lock to avoid the swap entry to be reused
+ * until we take the PT lock for the pte_same() check
+ * (to avoid false positives from pte_same). For
+ * further safety release the lock after the swap_free
+ * so that the swap count won't change under a
+ * parallel locked swapcache.
+ */
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
if (flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2629,7 +2856,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
+ update_mmu_cache(vma, address, page_table);
unlock:
pte_unmap_unlock(page_table, ptl);
out:
@@ -2641,10 +2868,48 @@ out_page:
unlock_page(page);
out_release:
page_cache_release(page);
+ if (swapcache) {
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
return ret;
}
/*
+ * This is like a special single-page "expand_{down|up}wards()",
+ * except we must first make sure that 'address{-|+}PAGE_SIZE'
+ * doesn't hit another vma.
+ */
+static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
+{
+ address &= PAGE_MASK;
+ if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
+ struct vm_area_struct *prev = vma->vm_prev;
+
+ /*
+ * Is there a mapping abutting this one below?
+ *
+ * That's only ok if it's the same stack mapping
+ * that has gotten split..
+ */
+ if (prev && prev->vm_end == address)
+ return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
+
+ expand_stack(vma, address - PAGE_SIZE);
+ }
+ if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
+ struct vm_area_struct *next = vma->vm_next;
+
+ /* As VM_GROWSDOWN but s/below/above/ */
+ if (next && next->vm_start == address + PAGE_SIZE)
+ return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
+
+ expand_upwards(vma, address + PAGE_SIZE);
+ }
+ return 0;
+}
+
+/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2657,19 +2922,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t entry;
+ pte_unmap(page_table);
+
+ /* Check if we need to add a guard page to the stack */
+ if (check_stack_guard_page(vma, address) < 0)
+ return VM_FAULT_SIGBUS;
+
+ /* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
goto setpte;
}
/* Allocate our own private page. */
- pte_unmap(page_table);
-
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
@@ -2688,13 +2957,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table))
goto release;
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
setpte:
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, page_table);
unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
@@ -2742,7 +3011,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+ VM_FAULT_RETRY)))
return ret;
if (unlikely(PageHWPoison(vmf.page))) {
@@ -2842,10 +3112,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
} else {
- inc_mm_counter(mm, file_rss);
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) {
dirty_page = page;
@@ -2855,7 +3125,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
set_pte_at(mm, address, page_table, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, page_table);
} else {
if (charged)
mem_cgroup_uncharge_page(page);
@@ -2955,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+int handle_pte_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
@@ -2992,7 +3262,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3001,7 +3271,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
@@ -3023,6 +3293,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(PGFAULT);
+ /* do counter updates before entering really critical section. */
+ check_sync_rss_stat(current);
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3033,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- pte = pte_alloc_map(mm, pmd, address);
- if (!pte)
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
+ } else {
+ pmd_t orig_pmd = *pmd;
+ barrier();
+ if (pmd_trans_huge(orig_pmd)) {
+ if (flags & FAULT_FLAG_WRITE &&
+ !pmd_write(orig_pmd) &&
+ !pmd_trans_splitting(orig_pmd))
+ return do_huge_pmd_wp_page(mm, vma, address,
+ pmd, orig_pmd);
+ return 0;
+ }
+ }
+
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
+ /* if an huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
@@ -3101,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
vma = find_vma(current->mm, addr);
if (!vma)
return -ENOMEM;
- write = (vma->vm_flags & VM_WRITE) != 0;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3156,7 +3465,7 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
-static int follow_pte(struct mm_struct *mm, unsigned long address,
+static int __follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
@@ -3173,6 +3482,7 @@ static int follow_pte(struct mm_struct *mm, unsigned long address,
goto out;
pmd = pmd_offset(pud, address);
+ VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
@@ -3193,6 +3503,17 @@ out:
return -EINVAL;
}
+static inline int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ return res;
+}
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -3402,3 +3723,74 @@ void might_fault(void)
}
EXPORT_SYMBOL(might_fault);
#endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+void clear_huge_page(struct page *page,
+ unsigned long addr, unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+ unsigned long addr,
+ struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page; ) {
+ cond_resched();
+ copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+ unsigned long addr, struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ copy_user_gigantic_page(dst, src, addr, vma,
+ pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 030ce8a5bb0e..321fc7455df7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -28,11 +28,29 @@
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
+#include <linux/firmware-map.h>
#include <asm/tlbflush.h>
#include "internal.h"
+DEFINE_MUTEX(mem_hotplug_mutex);
+
+void lock_memory_hotplug(void)
+{
+ mutex_lock(&mem_hotplug_mutex);
+
+ /* for exclusive hibernation if CONFIG_HIBERNATION=y */
+ lock_system_sleep();
+}
+
+void unlock_memory_hotplug(void)
+{
+ unlock_system_sleep();
+ mutex_unlock(&mem_hotplug_mutex);
+}
+
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
@@ -64,9 +82,10 @@ static void release_memory_resource(struct resource *res)
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info, struct page *page, int type)
+static void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
{
- atomic_set(&page->_mapcount, type);
+ page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
atomic_inc(&page->_count);
@@ -76,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
* so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
- int type;
+ unsigned long type;
- type = atomic_read(&page->_mapcount);
- BUG_ON(type >= -1);
+ type = (unsigned long) page->lru.next;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (atomic_dec_return(&page->_count) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
- reset_page_mapcount(page);
+ INIT_LIST_HEAD(&page->lru);
__free_pages_bootmem(page, 0);
}
@@ -389,6 +409,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
int ret;
struct memory_notify arg;
+ lock_memory_hotplug();
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
arg.status_change_nid = -1;
@@ -401,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ unlock_memory_hotplug();
return ret;
}
/*
@@ -414,22 +436,29 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
+ mutex_lock(&zonelists_mutex);
if (!populated_zone(zone))
need_zonelists_rebuild = 1;
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ mutex_unlock(&zonelists_mutex);
printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
nr_pages, pfn);
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ unlock_memory_hotplug();
return ret;
}
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
+ if (need_zonelists_rebuild)
+ build_all_zonelists(zone);
+ else
+ zone_pcp_update(zone);
- zone_pcp_update(zone);
+ mutex_unlock(&zonelists_mutex);
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
if (onlined_pages) {
@@ -437,15 +466,13 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
}
- if (need_zonelists_rebuild)
- build_all_zonelists();
- else
- vm_total_pages = nr_free_pagecache_pages();
+ vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
+ unlock_memory_hotplug();
return 0;
}
@@ -481,6 +508,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
}
+/*
+ * called by cpu_up() to online a node without onlined memory.
+ * Returns 0 on success, -ENOMEM if no pgdat could be set up for @nid.
+ */
+int mem_online_node(int nid)
+{
+ pg_data_t *pgdat;
+ int ret;
+
+ lock_memory_hotplug();
+ pgdat = hotadd_new_pgdat(nid, 0);
+ if (!pgdat) { /* NULL means failure: do not online the node */
+ ret = -ENOMEM;
+ goto out;
+ }
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ BUG_ON(ret);
+out:
+ unlock_memory_hotplug();
+ return ret;
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
@@ -489,7 +539,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
- lock_system_sleep();
+ lock_memory_hotplug();
res = register_memory_resource(start, size);
ret = -EEXIST;
@@ -523,6 +573,9 @@ int __ref add_memory(int nid, u64 start, u64 size)
BUG_ON(ret);
}
+ /* create new memmap entry */
+ firmware_map_add_hotplug(start, start + size, "System RAM");
+
goto out;
error:
@@ -533,7 +586,7 @@ error:
release_memory_resource(res);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -554,45 +607,32 @@ static inline int pageblock_free(struct page *page)
/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
- int pageblocks_stride;
-
/* Ensure the starting page is pageblock-aligned */
BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
- /* Move forward by at least 1 * pageblock_nr_pages */
- pageblocks_stride = 1;
-
/* If the entire pageblock is free, move to the end of free page */
- if (pageblock_free(page))
- pageblocks_stride += page_order(page) - pageblock_order;
+ if (pageblock_free(page)) {
+ int order;
+ /* be careful. we don't have locks, page_order can be changed.*/
+ order = page_order(page);
+ if ((order < MAX_ORDER) && (order >= pageblock_order))
+ return page + (1 << order);
+ }
- return page + (pageblocks_stride * pageblock_nr_pages);
+ return page + pageblock_nr_pages;
}
/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
- int type;
struct page *page = pfn_to_page(start_pfn);
struct page *end_page = page + nr_pages;
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
- type = get_pageblock_migratetype(page);
-
- /*
- * A pageblock containing MOVABLE or free pages is considered
- * removable
- */
- if (type != MIGRATE_MOVABLE && !pageblock_free(page))
- return 0;
-
- /*
- * A pageblock starting with a PageReserved page is not
- * considered removable.
- */
- if (PageReserved(page))
+ if (!is_pageblock_removable_nolock(page))
return 0;
+ cond_resched();
}
/* All pageblocks in the memory block are likely to be hot-removable */
@@ -629,7 +669,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
* Scanning pfn is much easier than scanning lru list.
* Scan pfn from start to end and Find LRU page.
*/
-int scan_lru_pages(unsigned long start, unsigned long end)
+static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
struct page *page;
@@ -679,29 +719,31 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
page_is_file_cache(page));
} else {
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
+ pfn);
+ dump_page(page);
+#endif
/* Becasue we don't have big zone->lock. we should
check this again here. */
- if (page_count(page))
+ if (page_count(page)) {
not_managed++;
-#ifdef CONFIG_DEBUG_VM
- printk(KERN_INFO "removing from LRU failed"
- " %lx/%d/%lx\n",
- pfn, page_count(page), page->flags);
-#endif
+ ret = -EBUSY;
+ break;
+ }
}
}
- ret = -EBUSY;
- if (not_managed) {
- if (!list_empty(&source))
+ if (!list_empty(&source)) {
+ if (not_managed) {
+ putback_lru_pages(&source);
+ goto out;
+ }
+ /* this function returns # of failed pages */
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+ true, true);
+ if (ret)
putback_lru_pages(&source);
- goto out;
}
- ret = 0;
- if (list_empty(&source))
- goto out;
- /* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
-
out:
return ret;
}
@@ -773,7 +815,7 @@ static int offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- lock_system_sleep();
+ lock_memory_hotplug();
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
@@ -810,7 +852,6 @@ repeat:
ret = 0;
if (drain) {
lru_add_drain_all();
- flush_scheduled_work();
cond_resched();
drain_all_pages();
}
@@ -832,7 +873,6 @@ repeat:
}
/* drain all zone's lru pagevec, this is asyncronous... */
lru_add_drain_all();
- flush_scheduled_work();
yield();
/* drain pcp pages , this is synchrouns. */
drain_all_pages();
@@ -864,7 +904,7 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- unlock_system_sleep();
+ unlock_memory_hotplug();
return 0;
failed_removal:
@@ -875,7 +915,7 @@ failed_removal:
undo_isolate_page_range(start_pfn, end_pfn);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 290fb5bf0440..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
-#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
@@ -120,7 +119,22 @@ struct mempolicy default_policy = {
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
- void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+ /*
+ * If read-side task has no lock to protect task->mempolicy, write-side
+ * task will rebind the task->mempolicy by two step. The first step is
+ * setting all the newly nodes, and the second step is cleaning all the
+ * disallowed nodes. In this way, we can avoid finding no node to alloc
+ * page.
+ * If we have a lock to protect task->mempolicy in read-side, we do
+ * rebind directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do rebind work at once
+ * MPOL_REBIND_STEP1 - set all the newly nodes
+ * MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ */
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
/* Check that the nodemask contains at least one populated zone */
@@ -128,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
{
int nd, k;
- /* Check that there is something useful in this mask */
- k = policy_zone;
-
for_each_node_mask(nd, *nodemask) {
struct zone *z;
@@ -146,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
- return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+ return pol->flags & MPOL_MODE_FLAGS;
}
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
@@ -278,12 +289,19 @@ void __mpol_put(struct mempolicy *p)
kmem_cache_free(policy_cache, p);
}
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
}
-static void mpol_rebind_nodemask(struct mempolicy *pol,
- const nodemask_t *nodes)
+/*
+ * step:
+ * MPOL_REBIND_ONCE - do rebind work at once
+ * MPOL_REBIND_STEP1 - set all the newly nodes
+ * MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ */
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
nodemask_t tmp;
@@ -292,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
- *nodes);
- pol->w.cpuset_mems_allowed = *nodes;
+ /*
+ * if step == 1, we use ->w.cpuset_mems_allowed to cache the
+ * result
+ */
+ if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
+ nodes_remap(tmp, pol->v.nodes,
+ pol->w.cpuset_mems_allowed, *nodes);
+ pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
+ } else if (step == MPOL_REBIND_STEP2) {
+ tmp = pol->w.cpuset_mems_allowed;
+ pol->w.cpuset_mems_allowed = *nodes;
+ } else
+ BUG();
}
- pol->v.nodes = tmp;
+ if (nodes_empty(tmp))
+ tmp = *nodes;
+
+ if (step == MPOL_REBIND_STEP1)
+ nodes_or(pol->v.nodes, pol->v.nodes, tmp);
+ else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
+ pol->v.nodes = tmp;
+ else
+ BUG();
+
if (!node_isset(current->il_next, tmp)) {
current->il_next = next_node(current->il_next, tmp);
if (current->il_next >= MAX_NUMNODES)
@@ -308,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
}
static void mpol_rebind_preferred(struct mempolicy *pol,
- const nodemask_t *nodes)
+ const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
nodemask_t tmp;
@@ -331,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
}
}
-/* Migrate a policy to a different set of nodes */
-static void mpol_rebind_policy(struct mempolicy *pol,
- const nodemask_t *newmask)
+/*
+ * mpol_rebind_policy - Migrate a policy to a different set of nodes
+ *
+ * If the read-side task has no lock to protect task->mempolicy, the
+ * write-side task rebinds task->mempolicy in two steps. The first step
+ * sets all the new nodes, and the second step clears all the
+ * disallowed nodes. This way, the reader never finds the policy with
+ * no node to allocate a page from.
+ * If the read side holds a lock protecting task->mempolicy, we
+ * rebind directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do rebind work at once
+ * MPOL_REBIND_STEP1 - set all the new nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
+ enum mpol_rebind_step step)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && step == 0 &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
- mpol_ops[pol->mode].rebind(pol, newmask);
+
+ if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
+ return;
+
+ if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
+ BUG();
+
+ if (step == MPOL_REBIND_STEP1)
+ pol->flags |= MPOL_F_REBINDING;
+ else if (step == MPOL_REBIND_STEP2)
+ pol->flags &= ~MPOL_F_REBINDING;
+ else if (step >= MPOL_REBIND_NSTEP)
+ BUG();
+
+ mpol_ops[pol->mode].rebind(pol, newmask, step);
}
/*
@@ -350,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
* Called with task's alloc_lock held.
*/
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+ enum mpol_rebind_step step)
{
- mpol_rebind_policy(tsk->mempolicy, new);
+ mpol_rebind_policy(tsk->mempolicy, new, step);
}
/*
@@ -367,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
down_write(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new);
+ mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
up_write(&mm->mmap_sem);
}
@@ -446,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(vma->vm_mm, pmd);
if (pmd_none_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -563,24 +632,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
}
/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct mempolicy *new)
+static int mbind_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *next;
- int err;
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ int err = 0;
+ pgoff_t pgoff;
+ unsigned long vmstart;
+ unsigned long vmend;
- err = 0;
- for (; vma && vma->vm_start < end; vma = next) {
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma || vma->vm_start > start)
+ return -EFAULT;
+
+ for (; vma && vma->vm_start < end; prev = vma, vma = next) {
next = vma->vm_next;
- if (vma->vm_start < start)
- err = split_vma(vma->vm_mm, vma, start, 1);
- if (!err && vma->vm_end > end)
- err = split_vma(vma->vm_mm, vma, end, 0);
- if (!err)
- err = policy_vma(vma, new);
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, new_pol);
+ if (prev) {
+ vma = prev;
+ next = vma->vm_next;
+ continue;
+ }
+ if (vma->vm_start != vmstart) {
+ err = split_vma(vma->vm_mm, vma, vmstart, 1);
+ if (err)
+ goto out;
+ }
+ if (vma->vm_end != vmend) {
+ err = split_vma(vma->vm_mm, vma, vmend, 0);
+ if (err)
+ goto out;
+ }
+ err = policy_vma(vma, new_pol);
if (err)
- break;
+ goto out;
}
+
+ out:
return err;
}
@@ -780,9 +875,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
err = 0;
if (nmask) {
- task_lock(current);
- get_policy_nodemask(pol, nmask);
- task_unlock(current);
+ if (mpol_store_user_nodemask(pol)) {
+ *nmask = pol->w.user_nodemask;
+ } else {
+ task_lock(current);
+ get_policy_nodemask(pol, nmask);
+ task_unlock(current);
+ }
}
out:
@@ -826,15 +925,22 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct vm_area_struct *vma;
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
- if (!list_empty(&pagelist))
- err = migrate_pages(&pagelist, new_node_page, dest, 0);
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages(&pagelist, new_node_page, dest,
+ false, true);
+ if (err)
+ putback_lru_pages(&pagelist);
+ }
return err;
}
@@ -862,36 +968,36 @@ int do_migrate_pages(struct mm_struct *mm,
if (err)
goto out;
-/*
- * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
- * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
- * bit in 'tmp', and return that <source, dest> pair for migration.
- * The pair of nodemasks 'to' and 'from' define the map.
- *
- * If no pair of bits is found that way, fallback to picking some
- * pair of 'source' and 'dest' bits that are not the same. If the
- * 'source' and 'dest' bits are the same, this represents a node
- * that will be migrating to itself, so no pages need move.
- *
- * If no bits are left in 'tmp', or if all remaining bits left
- * in 'tmp' correspond to the same bit in 'to', return false
- * (nothing left to migrate).
- *
- * This lets us pick a pair of nodes to migrate between, such that
- * if possible the dest node is not already occupied by some other
- * source node, minimizing the risk of overloading the memory on a
- * node that would happen if we migrated incoming memory to a node
- * before migrating outgoing memory source that same node.
- *
- * A single scan of tmp is sufficient. As we go, we remember the
- * most recent <s, d> pair that moved (s != d). If we find a pair
- * that not only moved, but what's better, moved to an empty slot
- * (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
- * most recent <s, d> pair that moved. If we get all the way through
- * the scan of tmp without finding any node that moved, much less
- * moved to an empty node, then there is nothing left worth migrating.
- */
+ /*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory from that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
tmp = *from_nodes;
while (!nodes_empty(tmp)) {
@@ -1047,11 +1153,15 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!IS_ERR(vma)) {
int nr_failed = 0;
- err = mbind_range(vma, start, end, new);
+ err = mbind_range(mm, start, end, new);
- if (!list_empty(&pagelist))
+ if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma, 0);
+ (unsigned long)vma,
+ false, true);
+ if (nr_failed)
+ putback_lru_pages(&pagelist);
+ }
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
@@ -1177,33 +1287,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, new_nodes)
{
const struct cred *cred = current_cred(), *tcred;
- struct mm_struct *mm;
+ struct mm_struct *mm = NULL;
struct task_struct *task;
- nodemask_t old;
- nodemask_t new;
nodemask_t task_nodes;
int err;
+ nodemask_t *old;
+ nodemask_t *new;
+ NODEMASK_SCRATCH(scratch);
+
+ if (!scratch)
+ return -ENOMEM;
- err = get_nodes(&old, old_nodes, maxnode);
+ old = &scratch->mask1;
+ new = &scratch->mask2;
+
+ err = get_nodes(old, old_nodes, maxnode);
if (err)
- return err;
+ goto out;
- err = get_nodes(&new, new_nodes, maxnode);
+ err = get_nodes(new, new_nodes, maxnode);
if (err)
- return err;
+ goto out;
/* Find the mm_struct */
- read_lock(&tasklist_lock);
+ rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
- return -ESRCH;
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto out;
}
mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
+ err = -EINVAL;
if (!mm)
- return -EINVAL;
+ goto out;
/*
* Check if this process has the right to modify the specified
@@ -1224,12 +1343,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
+ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
- if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
+ if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
goto out;
}
@@ -1238,10 +1357,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
if (err)
goto out;
- err = do_migrate_pages(mm, &old, &new,
+ err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
- mmput(mm);
+ if (mm)
+ mmput(mm);
+ NODEMASK_SCRATCH_FREE(scratch);
+
return err;
}
@@ -1415,15 +1537,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
/*
* Normally, MPOL_BIND allocations are node-local within the
* allowed nodemask. However, if __GFP_THISNODE is set and the
- * current node is part of the mask, we use the zonelist for
+ * current node isn't part of the mask, we use the zonelist for
* the first node in the mask instead.
*/
if (unlikely(gfp & __GFP_THISNODE) &&
unlikely(!node_isset(nd, policy->v.nodes)))
nd = first_node(policy->v.nodes);
break;
- case MPOL_INTERLEAVE: /* should not happen */
- break;
default:
BUG();
}
@@ -1480,7 +1600,7 @@ unsigned slab_node(struct mempolicy *policy)
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone->node;
+ return zone ? zone->node : numa_node_id();
}
default:
@@ -1543,6 +1663,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
+ *
+ * Must be protected by get_mems_allowed()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
@@ -1588,6 +1710,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
if (!(mask && current->mempolicy))
return false;
+ task_lock(current);
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
@@ -1607,11 +1730,56 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
default:
BUG();
}
+ task_unlock(current);
return true;
}
#endif
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy. Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ bool ret = true;
+
+ if (!mask)
+ return ret;
+ task_lock(tsk);
+ mempolicy = tsk->mempolicy;
+ if (!mempolicy)
+ goto out;
+
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
+ * allocate from, they may fallback to other nodes when oom.
+ * Thus, it's possible for tsk to have allocated memory from
+ * nodes in mask.
+ */
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ ret = nodes_intersects(mempolicy->v.nodes, *mask);
+ break;
+ default:
+ BUG();
+ }
+out:
+ task_unlock(tsk);
+ return ret;
+}
+
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -1628,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
}
/**
- * alloc_page_vma - Allocate a page for a VMA.
+ * alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
@@ -1637,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
+ * @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
*
@@ -1650,32 +1819,41 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* Should be called with the mm_sem of the vma hold.
*/
struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
+ struct page *page;
+ get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
mpol_cond_put(pol);
- return alloc_page_interleave(gfp, 0, nid);
+ page = alloc_page_interleave(gfp, order, nid);
+ put_mems_allowed();
+ return page;
}
zl = policy_zonelist(gfp, pol);
if (unlikely(mpol_needs_cond_ref(pol))) {
/*
* slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages_nodemask(gfp, 0,
+ struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
+ put_mems_allowed();
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, order, zl,
+ policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
/**
@@ -1700,18 +1878,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
+ struct page *page;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
+ get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages_nodemask(gfp, order,
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -1721,6 +1904,9 @@ EXPORT_SYMBOL(alloc_pages_current);
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
+ *
+ * current's mempolicy may be rebound by another task (the task that changes
+ * the cpuset's mems), so we needn't do rebind work for the current task.
*/
/* Slow path of a mempolicy duplicate */
@@ -1730,11 +1916,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
if (!new)
return ERR_PTR(-ENOMEM);
+
+ /* task's mempolicy is protected by alloc_lock */
+ if (old == current->mempolicy) {
+ task_lock(current);
+ *new = *old;
+ task_unlock(current);
+ } else
+ *new = *old;
+
+ rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
- mpol_rebind_policy(old, &mems);
+ if (new->flags & MPOL_F_REBINDING)
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
+ else
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
- *new = *old;
+ rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
@@ -1761,16 +1960,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
return tompol;
}
-static int mpol_match_intent(const struct mempolicy *a,
- const struct mempolicy *b)
-{
- if (a->flags != b->flags)
- return 0;
- if (!mpol_store_user_nodemask(a))
- return 1;
- return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
-}
-
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -1778,8 +1967,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return 0;
if (a->mode != b->mode)
return 0;
- if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
+ if (a->flags != b->flags)
return 0;
+ if (mpol_store_user_nodemask(a))
+ if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
+ return 0;
+
switch (a->mode) {
case MPOL_BIND:
/* Fall through */
@@ -1972,31 +2165,29 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
NODEMASK_SCRATCH(scratch);
if (!scratch)
- return;
+ goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
- if (IS_ERR(new)) {
- mpol_put(mpol); /* drop our ref on sb mpol */
- NODEMASK_SCRATCH_FREE(scratch);
- return; /* no valid nodemask intersection */
- }
+ if (IS_ERR(new))
+ goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
- mpol_put(mpol); /* drop our ref on sb mpol */
- if (ret) {
- NODEMASK_SCRATCH_FREE(scratch);
- mpol_put(new);
- return;
- }
+ if (ret)
+ goto put_new;
/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+
+put_new:
mpol_put(new); /* drop initial ref */
+free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
+put_mpol:
+ mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
@@ -2101,9 +2292,15 @@ void numa_default_policy(void)
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
-static const char * const policy_types[] =
- { "default", "prefer", "bind", "interleave", "local" };
+#define MPOL_LOCAL MPOL_MAX
+static const char * const policy_modes[] =
+{
+ [MPOL_DEFAULT] = "default",
+ [MPOL_PREFERRED] = "prefer",
+ [MPOL_BIND] = "bind",
+ [MPOL_INTERLEAVE] = "interleave",
+ [MPOL_LOCAL] = "local"
+};
#ifdef CONFIG_TMPFS
@@ -2128,12 +2325,11 @@ static const char * const policy_types[] =
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
struct mempolicy *new = NULL;
- unsigned short uninitialized_var(mode);
+ unsigned short mode;
unsigned short uninitialized_var(mode_flags);
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int i;
int err = 1;
if (nodelist) {
@@ -2149,13 +2345,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (i = 0; i <= MPOL_LOCAL; i++) {
- if (!strcmp(str, policy_types[i])) {
- mode = i;
+ for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (i > MPOL_LOCAL)
+ if (mode > MPOL_LOCAL)
goto out;
switch (mode) {
@@ -2167,8 +2362,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
char *rest = nodelist;
while (isdigit(*rest))
rest++;
- if (!*rest)
- err = 0;
+ if (*rest)
+ goto out;
}
break;
case MPOL_INTERLEAVE:
@@ -2177,7 +2372,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
*/
if (!nodelist)
nodes = node_states[N_HIGH_MEMORY];
- err = 0;
break;
case MPOL_LOCAL:
/*
@@ -2187,11 +2381,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
goto out;
mode = MPOL_PREFERRED;
break;
-
- /*
- * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
- * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
- */
+ case MPOL_DEFAULT:
+ /*
+ * Insist on an empty nodelist
+ */
+ if (!nodelist)
+ err = 0;
+ goto out;
+ case MPOL_BIND:
+ /*
+ * Insist on a nodelist
+ */
+ if (!nodelist)
+ goto out;
}
mode_flags = 0;
@@ -2205,13 +2407,17 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
else if (!strcmp(flags, "relative"))
mode_flags |= MPOL_F_RELATIVE_NODES;
else
- err = 1;
+ goto out;
}
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
- err = 1;
- else {
+ goto out;
+
+ if (no_context) {
+ /* save for contextualization */
+ new->w.user_nodemask = nodes;
+ } else {
int ret;
NODEMASK_SCRATCH(scratch);
if (scratch) {
@@ -2222,13 +2428,11 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
ret = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
if (ret) {
- err = 1;
mpol_put(new);
- } else if (no_context) {
- /* save for contextualization */
- new->w.user_nodemask = nodes;
+ goto out;
}
}
+ err = 0;
out:
/* Restore string for error message */
@@ -2297,11 +2501,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
BUG();
}
- l = strlen(policy_types[mode]);
+ l = strlen(policy_modes[mode]);
if (buffer + maxlen < p + l + 1)
return -ENOSPC;
- strcpy(p, policy_types[mode]);
+ strcpy(p, policy_modes[mode]);
p += l;
if (flags & MPOL_MODE_FLAGS) {
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b2..46fe8cc13d67 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,10 @@
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+#include <linux/gfp.h>
+
+#include <asm/tlbflush.h>
#include "internal.h"
@@ -39,7 +43,8 @@
/*
* migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page().
+ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
+ * undesirable, use migrate_prep_local()
*/
int migrate_prep(void)
{
@@ -54,26 +59,29 @@ int migrate_prep(void)
return 0;
}
+/* Do the necessary work of migrate_prep but not if it involves other CPUs */
+int migrate_prep_local(void)
+{
+ lru_add_drain();
+
+ return 0;
+}
+
/*
* Add isolated pages on the list back to the LRU under page lock
* to avoid leaking evictable pages back onto unevictable list.
- *
- * returns the number of pages put back.
*/
-int putback_lru_pages(struct list_head *l)
+void putback_lru_pages(struct list_head *l)
{
struct page *page;
struct page *page2;
- int count = 0;
list_for_each_entry_safe(page, page2, l, lru) {
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
- count++;
}
- return count;
}
/*
@@ -90,26 +98,36 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte_t *ptep, pte;
spinlock_t *ptl;
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- goto out;
+ if (unlikely(PageHuge(new))) {
+ ptep = huge_pte_offset(mm, addr);
+ if (!ptep)
+ goto out;
+ ptl = &mm->page_table_lock;
+ } else {
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ goto out;
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ goto out;
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- goto out;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_trans_huge(*pmd))
+ goto out;
+ if (!pmd_present(*pmd))
+ goto out;
- ptep = pte_offset_map(pmd, addr);
+ ptep = pte_offset_map(pmd, addr);
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- goto out;
- }
+ if (!is_swap_pte(*ptep)) {
+ pte_unmap(ptep);
+ goto out;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ }
- ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
@@ -125,16 +143,25 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(new))
+ pte = pte_mkhuge(pte);
+#endif
flush_cache_page(vma, addr, pte_pfn(pte));
set_pte_at(mm, addr, ptep, pte);
- if (PageAnon(new))
+ if (PageHuge(new)) {
+ if (PageAnon(new))
+ hugepage_add_anon_rmap(new, vma, addr);
+ else
+ page_dup_rmap(new);
+ } else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr);
else
page_add_file_rmap(new);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, pte);
+ update_mmu_cache(vma, addr, ptep);
unlock:
pte_unmap_unlock(ptep, ptl);
out:
@@ -221,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -271,13 +298,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
/*
- * Copy the page to its new location
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
*/
-static void migrate_page_copy(struct page *newpage, struct page *page)
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
{
- int anon;
+ int expected_count;
+ void **pslot;
+
+ if (!mapping) {
+ if (page_count(page) != 1)
+ return -EAGAIN;
+ return 0;
+ }
- copy_highpage(newpage, page);
+ spin_lock_irq(&mapping->tree_lock);
+
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ get_page(newpage);
+
+ radix_tree_replace_slot(pslot, newpage);
+
+ page_unfreeze_refs(page, expected_count);
+
+ __put_page(page);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ if (PageHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
if (PageError(page))
SetPageError(newpage);
@@ -313,8 +386,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
ClearPageSwapCache(page);
ClearPagePrivate(page);
set_page_private(page, 0);
- /* page->mapping contains a flag for PageAnon() */
- anon = PageAnon(page);
page->mapping = NULL;
/*
@@ -430,7 +501,6 @@ static int writeout(struct address_space *mapping, struct page *page)
.nr_to_write = 1,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1
};
int rc;
@@ -493,7 +563,8 @@ static int fallback_migrate_page(struct address_space *mapping,
* < 0 - error code
* == 0 - success
*/
-static int move_to_new_page(struct page *newpage, struct page *page)
+static int move_to_new_page(struct page *newpage, struct page *page,
+ int remap_swapcache)
{
struct address_space *mapping;
int rc;
@@ -528,10 +599,12 @@ static int move_to_new_page(struct page *newpage, struct page *page)
else
rc = fallback_migrate_page(mapping, newpage, page);
- if (!rc)
- remove_migration_ptes(page, newpage);
- else
+ if (rc) {
newpage->mapping = NULL;
+ } else {
+ if (remap_swapcache)
+ remove_migration_ptes(page, newpage);
+ }
unlock_page(newpage);
@@ -543,14 +616,15 @@ static int move_to_new_page(struct page *newpage, struct page *page)
* to the newly allocated page in newpage.
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, int offlining)
+ struct page *page, int force, bool offlining, bool sync)
{
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
- int rcu_locked = 0;
+ int remap_swapcache = 1;
int charge = 0;
struct mem_cgroup *mem = NULL;
+ struct anon_vma *anon_vma = NULL;
if (!newpage)
return -ENOMEM;
@@ -559,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
/* page was freed from under us. So we are done. */
goto move_newpage;
}
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto move_newpage;
/* prepare cgroup just returns 0 or -ENOMEM */
rc = -EAGAIN;
@@ -566,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (!trylock_page(page)) {
if (!force)
goto move_newpage;
+
+ /*
+ * It's not safe for direct compaction to call lock_page.
+ * For example, during page readahead pages are added locked
+ * to the LRU. Later, when the IO completes the pages are
+ * marked uptodate and unlocked. However, the queueing
+ * could be merging multiple pages for one bio (e.g.
+ * mpage_readpages). If an allocation happens for the
+ * second or third page, the process can end up locking
+ * the same page twice and deadlocking. Rather than
+ * trying to be clever about what pages can be locked,
+ * avoid the use of lock_page for direct compaction
+ * altogether.
+ */
+ if (current->flags & PF_MEMALLOC)
+ goto move_newpage;
+
lock_page(page);
}
@@ -584,7 +678,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
}
/* charge against new page */
- charge = mem_cgroup_prepare_migration(page, &mem);
+ charge = mem_cgroup_prepare_migration(page, newpage, &mem);
if (charge == -ENOMEM) {
rc = -ENOMEM;
goto unlock;
@@ -592,21 +686,49 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
BUG_ON(charge);
if (PageWriteback(page)) {
- if (!force)
+ if (!force || !sync)
goto uncharge;
wait_on_page_writeback(page);
}
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
* we cannot notice that anon_vma is freed while we migrate a page.
- * This rcu_read_lock() delays freeing anon_vma pointer until the end
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
if (PageAnon(page)) {
- rcu_read_lock();
- rcu_locked = 1;
+ /*
+ * Only page_lock_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ */
+ anon_vma = page_lock_anon_vma(page);
+ if (anon_vma) {
+ /*
+ * Take a reference count on the anon_vma if the
+ * page is mapped so that it is guaranteed to
+ * exist when the page is remapped later
+ */
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
+ } else if (PageSwapCache(page)) {
+ /*
+ * We cannot be sure that the anon_vma of an unmapped
+ * swapcache page is safe to use because we don't
+ * know in advance if the VMA that this page belonged
+ * to still exists. If the VMA and others sharing the
+ * data have been freed, then the anon_vma could
+ * already be invalid.
+ *
+ * To avoid this possibility, swapcache pages get
+ * migrated but are not remapped when migration
+ * completes
+ */
+ remap_swapcache = 0;
+ } else {
+ goto uncharge;
+ }
}
/*
@@ -622,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- if (!PageAnon(page) && page_has_private(page)) {
- /*
- * Go direct to try_to_free_buffers() here because
- * a) that's what try_to_release_page() would do anyway
- * b) we may be under rcu_read_lock() here, so we can't
- * use GFP_KERNEL which is what try_to_release_page()
- * needs to be effective.
- */
+ VM_BUG_ON(PageAnon(page));
+ if (page_has_private(page)) {
try_to_free_buffers(page);
- goto rcu_unlock;
+ goto uncharge;
}
goto skip_unmap;
}
@@ -641,16 +757,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page);
+ rc = move_to_new_page(newpage, page, remap_swapcache);
- if (rc)
+ if (rc && remap_swapcache)
remove_migration_ptes(page, page);
-rcu_unlock:
- if (rcu_locked)
- rcu_read_unlock();
+
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
+
uncharge:
if (!charge)
- mem_cgroup_end_migration(mem, page, newpage);
+ mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
@@ -685,6 +803,81 @@ move_newpage:
}
/*
+ * Counterpart of unmap_and_move() for hugepage migration.
+ *
+ * This function doesn't wait for the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ *
+ * @get_new_page: callback invoked as get_new_page(hpage, private, &result)
+ *                to obtain the destination page
+ * @private:      opaque value handed through to @get_new_page
+ * @hpage:        the huge page to migrate
+ * @force:        if zero, give up instead of sleeping on the page lock
+ * @offlining:    not used by this function
+ * @sync:         if false, give up instead of sleeping on the page lock
+ *
+ * Returns -ENOMEM if no destination page could be obtained, -EAGAIN if the
+ * page lock could not be taken or the page is still mapped after
+ * try_to_unmap(), otherwise the result of move_to_new_page(). Unless the
+ * result is -EAGAIN, @hpage is taken off its lru list and released with
+ * put_page(). If @get_new_page supplied a result slot, it receives the
+ * error code, or on success the node id of the new page.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, bool offlining, bool sync)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		/*
+		 * The page lock is NOT held on this exit, so "out" must sit
+		 * below unlock_page(); unlocking here would release a lock
+		 * that belongs to somebody else.
+		 */
+		if (!force || !sync)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage)) {
+		/*
+		 * Hold a reference on the anon_vma across the unmap/remap
+		 * sequence below; it is dropped once the ptes are restored.
+		 */
+		anon_vma = page_lock_anon_vma(hpage);
+		if (anon_vma) {
+			get_anon_vma(anon_vma);
+			page_unlock_anon_vma(anon_vma);
+		}
+	}
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1);
+
+	/* Migration failed: point the migration ptes back at the old page. */
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma)
+		drop_anon_vma(anon_vma);
+
+	unlock_page(hpage);
+out:
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
+/*
* migrate_pages
*
* The function takes one list of pages to migrate and a function
@@ -693,13 +886,15 @@ move_newpage:
*
* The function returns after 10 attempts or if no pages
* are movable anymore because to has become empty
- * or no retryable pages exist anymore. All pages will be
- * returned to the LRU or freed.
+ * or no retryable pages exist anymore.
+ * Caller should call putback_lru_pages to return pages to the LRU
+ * or free list.
*
* Return: Number of pages not migrated or error code.
*/
int migrate_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private, int offlining)
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
{
int retry = 1;
int nr_failed = 0;
@@ -719,7 +914,8 @@ int migrate_pages(struct list_head *from,
cond_resched();
rc = unmap_and_move(get_new_page, private,
- page, pass > 2, offlining);
+ page, pass > 2, offlining,
+ sync);
switch(rc) {
case -ENOMEM:
@@ -741,7 +937,53 @@ out:
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
- putback_lru_pages(from);
+ if (rc)
+ return rc;
+
+ return nr_failed + retry;
+}
+
+int migrate_huge_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
+{
+ int retry = 1;
+ int nr_failed = 0;
+ int pass = 0;
+ struct page *page;
+ struct page *page2;
+ int rc;
+
+ for (pass = 0; pass < 10 && retry; pass++) {
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ cond_resched();
+
+ rc = unmap_and_move_huge_page(get_new_page,
+ private, page, pass > 2, offlining,
+ sync);
+
+ switch(rc) {
+ case -ENOMEM:
+ goto out;
+ case -EAGAIN:
+ retry++;
+ break;
+ case 0:
+ break;
+ default:
+ /* Permanent failure */
+ nr_failed++;
+ break;
+ }
+ }
+ }
+ rc = 0;
+out:
+
+ list_for_each_entry_safe(page, page2, from, lru)
+ put_page(page);
if (rc)
return rc;
@@ -802,10 +1044,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
err = -EFAULT;
vma = find_vma(mm, pp->addr);
- if (!vma || !vma_migratable(vma))
+ if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
goto set_status;
- page = follow_page(vma, pp->addr, FOLL_GET);
+ page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -851,9 +1093,12 @@ set_status:
}
err = 0;
- if (!list_empty(&pagelist))
+ if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0);
+ (unsigned long)pm, 0, true);
+ if (err)
+ putback_lru_pages(&pagelist);
+ }
up_read(&mm->mmap_sem);
return err;
@@ -912,6 +1157,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
goto out_pm;
err = -ENODEV;
+ if (node < 0 || node >= MAX_NUMNODES)
+ goto out_pm;
+
if (!node_state(node, N_HIGH_MEMORY))
goto out_pm;
@@ -963,7 +1211,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
int err = -EFAULT;
vma = find_vma(mm, addr);
- if (!vma)
+ if (!vma || addr < vma->vm_start)
goto set_status;
page = follow_page(vma, addr, 0);
@@ -999,33 +1247,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
#define DO_PAGES_STAT_CHUNK_NR 16
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
- unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
- int err;
- for (i = 0; i < nr_pages; i += chunk_nr) {
- if (chunk_nr > nr_pages - i)
- chunk_nr = nr_pages - i;
+ while (nr_pages) {
+ unsigned long chunk_nr;
- err = copy_from_user(chunk_pages, &pages[i],
- chunk_nr * sizeof(*chunk_pages));
- if (err) {
- err = -EFAULT;
- goto out;
- }
+ chunk_nr = nr_pages;
+ if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
+ chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+
+ if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
+ break;
do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
- err = copy_to_user(&status[i], chunk_status,
- chunk_nr * sizeof(*chunk_status));
- if (err) {
- err = -EFAULT;
- goto out;
- }
- }
- err = 0;
+ if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+ break;
-out:
- return err;
+ pages += chunk_nr;
+ status += chunk_nr;
+ nr_pages -= chunk_nr;
+ }
+ return nr_pages ? -EFAULT : 0;
}
/*
diff --git a/mm/mincore.c b/mm/mincore.c
index 7a3436ef39eb..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -7,8 +7,8 @@
/*
* The mincore() system call.
*/
-#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/syscalls.h>
@@ -19,6 +19,40 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+/*
+ * Report residency for a hugetlb vma: store one status byte in @vec for
+ * every PAGE_SIZE page in [addr, end).
+ *
+ * A byte is 1 when huge_pte_offset() finds a pte for the address and that
+ * pte is not none, otherwise 0; the lookup is done once per huge page and
+ * the result is repeated for each small page up to the next huge page
+ * border. The walk ends only when addr equals end exactly, so the range is
+ * presumably page aligned and non-empty (confirm against the caller).
+ * Without CONFIG_HUGETLB_PAGE this function must never be reached: BUG().
+ */
+static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ struct hstate *h;
+
+ h = hstate_vma(vma);
+ while (1) {
+ unsigned char present;
+ pte_t *ptep;
+ /*
+ * Huge pages are always in RAM for now, but
+ * theoretically it needs to be checked.
+ */
+ ptep = huge_pte_offset(current->mm,
+ addr & huge_page_mask(h));
+ present = ptep && !huge_pte_none(huge_ptep_get(ptep));
+ /* Emit the same status for each small page of this huge page. */
+ while (1) {
+ *vec = present;
+ vec++;
+ addr += PAGE_SIZE;
+ if (addr == end)
+ return;
+ /* check hugepage border */
+ if (!(addr & ~huge_page_mask(h)))
+ break;
+ }
+ }
+#else
+ BUG();
+#endif
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
@@ -49,145 +83,157 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
return present;
}
-/*
- * Do a chunk of "sys_mincore()". We've already checked
- * all the arguments, we hold the mmap semaphore: we should
- * just return the amount of info we're asked for.
- */
-static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
+static void mincore_unmapped_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep;
- spinlock_t *ptl;
- unsigned long nr;
+ unsigned long nr = (end - addr) >> PAGE_SHIFT;
int i;
- pgoff_t pgoff;
- struct vm_area_struct *vma = find_vma(current->mm, addr);
-
- /*
- * find_vma() didn't find anything above us, or we're
- * in an unmapped hole in the address space: ENOMEM.
- */
- if (!vma || addr < vma->vm_start)
- return -ENOMEM;
-#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma)) {
- struct hstate *h;
- unsigned long nr_huge;
- unsigned char present;
+ if (vma->vm_file) {
+ pgoff_t pgoff;
- i = 0;
- nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
- h = hstate_vma(vma);
- nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
- - (addr >> huge_page_shift(h)) + 1;
- nr_huge = min(nr_huge,
- (vma->vm_end - addr) >> huge_page_shift(h));
- while (1) {
- /* hugepage always in RAM for now,
- * but generally it needs to be check */
- ptep = huge_pte_offset(current->mm,
- addr & huge_page_mask(h));
- present = !!(ptep &&
- !huge_pte_none(huge_ptep_get(ptep)));
- while (1) {
- vec[i++] = present;
- addr += PAGE_SIZE;
- /* reach buffer limit */
- if (i == nr)
- return nr;
- /* check hugepage border */
- if (!((addr & ~huge_page_mask(h))
- >> PAGE_SHIFT))
- break;
- }
- }
- return nr;
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
}
-#endif
-
- /*
- * Calculate how many pages there are left in the last level of the
- * PTE array for our address.
- */
- nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
-
- /*
- * Don't overrun this vma
- */
- nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
-
- /*
- * Don't return more than the caller asked for
- */
- nr = min(nr, pages);
+}
- pgd = pgd_offset(vma->vm_mm, addr);
- if (pgd_none_or_clear_bad(pgd))
- goto none_mapped;
- pud = pud_offset(pgd, addr);
- if (pud_none_or_clear_bad(pud))
- goto none_mapped;
- pmd = pmd_offset(pud, addr);
- if (pmd_none_or_clear_bad(pmd))
- goto none_mapped;
+static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ spinlock_t *ptl;
+ pte_t *ptep;
ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
- unsigned char present;
+ do {
pte_t pte = *ptep;
+ pgoff_t pgoff;
- if (pte_present(pte)) {
- present = 1;
-
- } else if (pte_none(pte)) {
- if (vma->vm_file) {
- pgoff = linear_page_index(vma, addr);
- present = mincore_page(vma->vm_file->f_mapping,
- pgoff);
- } else
- present = 0;
-
- } else if (pte_file(pte)) {
+ next = addr + PAGE_SIZE;
+ if (pte_none(pte))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else if (pte_present(pte))
+ *vec = 1;
+ else if (pte_file(pte)) {
pgoff = pte_to_pgoff(pte);
- present = mincore_page(vma->vm_file->f_mapping, pgoff);
-
+ *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
} else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
+
if (is_migration_entry(entry)) {
/* migration entries are always uptodate */
- present = 1;
+ *vec = 1;
} else {
#ifdef CONFIG_SWAP
pgoff = entry.val;
- present = mincore_page(&swapper_space, pgoff);
+ *vec = mincore_page(&swapper_space, pgoff);
#else
WARN_ON(1);
- present = 1;
+ *vec = 1;
#endif
}
}
+ vec++;
+ } while (ptep++, addr = next, addr != end);
+ pte_unmap_unlock(ptep - 1, ptl);
+}
- vec[i] = present;
- }
- pte_unmap_unlock(ptep-1, ptl);
+static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pmd_t *pmd;
- return nr;
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
+ vec += (next - addr) >> PAGE_SHIFT;
+ continue;
+ }
+ /* fall through */
+ }
+ if (pmd_none_or_clear_bad(pmd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pte_range(vma, pmd, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pmd++, addr = next, addr != end);
+}
-none_mapped:
- if (vma->vm_file) {
- pgoff = linear_page_index(vma, addr);
- for (i = 0; i < nr; i++, pgoff++)
- vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
- } else {
- for (i = 0; i < nr; i++)
- vec[i] = 0;
+/*
+ * Walk the pud entries under @pgd that cover [addr, end) and fill @vec,
+ * one byte per page. A pud that is none (or bad) is reported through
+ * mincore_unmapped_range(); any other pud is descended into with
+ * mincore_pmd_range().
+ */
+static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+			      unsigned long addr, unsigned long end,
+			      unsigned char *vec)
+{
+	pud_t *entry = pud_offset(pgd, addr);
+
+	for (;;) {
+		unsigned long stop = pud_addr_end(addr, end);
+
+		if (!pud_none_or_clear_bad(entry))
+			mincore_pmd_range(vma, entry, addr, stop, vec);
+		else
+			mincore_unmapped_range(vma, addr, stop, vec);
+
+		/* One status byte was produced for every page just covered. */
+		vec += (stop - addr) >> PAGE_SHIFT;
+
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+}
+
+/*
+ * Top of the page table walk for mincore: visit the pgd entries of
+ * vma->vm_mm that cover [addr, end) and fill @vec, one byte per page.
+ * A pgd that is none (or bad) is reported through mincore_unmapped_range();
+ * any other pgd is descended into with mincore_pud_range().
+ */
+static void mincore_page_range(struct vm_area_struct *vma,
+			       unsigned long addr, unsigned long end,
+			       unsigned char *vec)
+{
+	pgd_t *entry = pgd_offset(vma->vm_mm, addr);
+
+	for (;;) {
+		unsigned long stop = pgd_addr_end(addr, end);
+
+		if (!pgd_none_or_clear_bad(entry))
+			mincore_pud_range(vma, entry, addr, stop, vec);
+		else
+			mincore_unmapped_range(vma, addr, stop, vec);
+
+		/* One status byte was produced for every page just covered. */
+		vec += (stop - addr) >> PAGE_SHIFT;
+
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+}
+
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ *
+ * @addr:  first address to report on
+ * @pages: upper bound on the number of pages to report
+ * @vec:   output buffer, one status byte per page
+ *
+ * Returns the number of pages whose status was stored in @vec, which may be
+ * fewer than @pages, or -ENOMEM when @addr does not lie inside any vma.
+ */
+static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+{
+	struct vm_area_struct *vma;
+	unsigned long end;
+
+	vma = find_vma(current->mm, addr);
+	if (!vma || addr < vma->vm_start)
+		return -ENOMEM;
+
+	/* Never look past the end of this vma. */
+	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+
+	if (is_vm_hugetlb_page(vma)) {
+		mincore_hugetlb_page_range(vma, addr, end, vec);
+		return (end - addr) >> PAGE_SHIFT;
}
- return nr;
+
+	/*
+	 * hugetlb vmas have returned above, so only the regular page table
+	 * walk remains. The chunk is further limited with pmd_addr_end().
+	 */
+	end = pmd_addr_end(addr, end);
+	mincore_page_range(vma, addr, end, vec);
+
+	return (end - addr) >> PAGE_SHIFT;
}
/*
@@ -247,7 +293,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
* the temporary buffer size.
*/
down_read(&current->mm->mmap_sem);
- retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+ retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
up_read(&current->mm->mmap_sem);
if (retval <= 0)
diff --git a/mm/mlock.c b/mm/mlock.c
index 2b8335a89400..13e81ee8be9d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -25,7 +25,7 @@ int can_do_mlock(void)
{
if (capable(CAP_IPC_LOCK))
return 1;
- if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
+ if (rlimit(RLIMIT_MEMLOCK) != 0)
return 1;
return 0;
}
@@ -135,6 +135,13 @@ void munlock_vma_page(struct page *page)
}
}
+/*
+ * Return 1 when @addr is the start of a VM_GROWSDOWN vma and
+ * vma_stack_continue() does not accept the previous vma as its
+ * continuation at @addr; return 0 otherwise.
+ */
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		return 0;
+	if (vma->vm_start != addr)
+		return 0;
+	return !vma_stack_continue(vma->vm_prev, addr);
+}
+
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
@@ -148,13 +155,12 @@ void munlock_vma_page(struct page *page)
* vma->vm_mm->mmap_sem must be held for at least read.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start;
- struct page *pages[16]; /* 16 gives a reasonable batch */
int nr_pages = (end - start) / PAGE_SIZE;
- int ret = 0;
int gup_flags;
VM_BUG_ON(start & ~PAGE_MASK);
@@ -163,67 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- gup_flags = FOLL_TOUCH | FOLL_GET;
- if (vma->vm_flags & VM_WRITE)
+ gup_flags = FOLL_TOUCH;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
- while (nr_pages > 0) {
- int i;
-
- cond_resched();
-
- /*
- * get_user_pages makes pages present if we are
- * setting mlock. and this extra reference count will
- * disable migration of this page. However, page may
- * still be truncated out from under us.
- */
- ret = __get_user_pages(current, mm, addr,
- min_t(int, nr_pages, ARRAY_SIZE(pages)),
- gup_flags, pages, NULL);
- /*
- * This can happen for, e.g., VM_NONLINEAR regions before
- * a page has been allocated and mapped at a given offset,
- * or for addresses that map beyond end of a file.
- * We'll mlock the pages if/when they get faulted in.
- */
- if (ret < 0)
- break;
+ if (vma->vm_flags & VM_LOCKED)
+ gup_flags |= FOLL_MLOCK;
- lru_add_drain(); /* push cached pages to LRU */
-
- for (i = 0; i < ret; i++) {
- struct page *page = pages[i];
-
- if (page->mapping) {
- /*
- * That preliminary check is mainly to avoid
- * the pointless overhead of lock_page on the
- * ZERO_PAGE: which might bounce very badly if
- * there is contention. However, we're still
- * dirtying its cacheline with get/put_page:
- * we'll add another __get_user_pages flag to
- * avoid it if that case turns out to matter.
- */
- lock_page(page);
- /*
- * Because we lock page here and migration is
- * blocked by the elevated reference, we need
- * only check for file-cache page truncation.
- */
- if (page->mapping)
- mlock_vma_page(page);
- unlock_page(page);
- }
- put_page(page); /* ref from get_user_pages() */
- }
-
- addr += ret * PAGE_SIZE;
- nr_pages -= ret;
- ret = 0;
+ /* We don't try to access the guard page of a stack vma */
+ if (stack_guard_page(vma, start)) {
+ addr += PAGE_SIZE;
+ nr_pages--;
}
- return ret; /* 0 or negative error code */
+ return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
}
/*
@@ -267,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) {
- __mlock_vma_pages_range(vma, start, end);
+ __mlock_vma_pages_range(vma, start, end, NULL);
/* Hide errors from mmap() and other callers */
return 0;
@@ -359,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
int ret = 0;
int lock = newflags & VM_LOCKED;
- if (newflags == vma->vm_flags ||
- (vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
goto out; /* don't set VM_LOCKED, don't count */
- if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current)) {
- if (lock)
- make_pages_present(start, end);
- goto out; /* don't set VM_LOCKED, don't count */
- }
-
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
@@ -406,14 +363,10 @@ success:
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/
- if (lock) {
+ if (lock)
vma->vm_flags = newflags;
- ret = __mlock_vma_pages_range(vma, start, end);
- if (ret < 0)
- ret = __mlock_posix_error_return(ret);
- } else {
+ else
munlock_vma_pages_range(vma, start, end);
- }
out:
*prev = vma;
@@ -426,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * prev;
int error;
- len = PAGE_ALIGN(len);
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
return -EINVAL;
@@ -469,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
+/*
+ * Fault in the pages of [start, start + len), one vma at a time, by calling
+ * __mlock_vma_pages_range() on each piece of the range.
+ *
+ * @start and @len must be page aligned (checked with VM_BUG_ON). mmap_sem
+ * is taken for read in here; "locked" records whether it is currently held
+ * and is handed to __mlock_vma_pages_range(), which presumably clears it
+ * when mmap_sem had to be dropped - the vma is then looked up afresh.
+ *
+ * If @ignore_errors is non-zero, a failing vma is skipped and the walk
+ * carries on with the next one; otherwise the error is translated with
+ * __mlock_posix_error_return() and the walk stops.
+ *
+ * Returns 0 or a negative error code.
+ */
+static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ int ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ /* nend is already set, so "continue" steps past this vma. */
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. __mlock_vma_pages_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ ret = __mlock_posix_error_return(ret);
+ break;
+ }
+ /* ret is used as a page count: resume right after those pages. */
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -487,13 +497,15 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
locked = len >> PAGE_SHIFT;
locked += current->mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
+ if (!error)
+ error = do_mlock_pages(start, len, 0);
return error;
}
@@ -550,7 +562,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
down_write(&current->mm->mmap_sem);
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
@@ -558,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
+ if (!ret && (flags & MCL_CURRENT)) {
+ /* Ignore errors */
+ do_mlock_pages(0, TASK_SIZE, 1);
+ }
out:
return ret;
}
@@ -584,7 +600,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
if (lock_limit == RLIM_INFINITY)
allowed = 1;
lock_limit >>= PAGE_SHIFT;
@@ -607,44 +623,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
spin_unlock(&shmlock_user_lock);
free_uid(user);
}
-
-int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
- size_t size)
-{
- unsigned long lim, vm, pgsz;
- int error = -ENOMEM;
-
- pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- down_write(&mm->mmap_sem);
-
- lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = mm->total_vm + pgsz;
- if (lim < vm)
- goto out;
-
- lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = mm->locked_vm + pgsz;
- if (lim < vm)
- goto out;
-
- mm->total_vm += pgsz;
- mm->locked_vm += pgsz;
-
- error = 0;
- out:
- up_write(&mm->mmap_sem);
- return error;
-}
-
-void refund_locked_memory(struct mm_struct *mm, size_t size)
-{
- unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- down_write(&mm->mmap_sem);
-
- mm->total_vm -= pgsz;
- mm->locked_vm -= pgsz;
-
- up_write(&mm->mmap_sem);
-}
diff --git a/mm/mmap.c b/mm/mmap.c
index ee2298936fe6..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,8 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
+#include <linux/audit.h>
+#include <linux/khugepaged.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -252,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
down_write(&mm->mmap_sem);
#ifdef CONFIG_COMPAT_BRK
- min_brk = mm->end_code;
+ /*
+ * CONFIG_COMPAT_BRK can still be overridden by setting
+ * randomize_va_space to 2, which will still cause mm->start_brk
+ * to be arbitrarily shifted
+ */
+ if (mm->start_brk > PAGE_ALIGN(mm->end_data))
+ min_brk = mm->start_brk;
+ else
+ min_brk = mm->end_data;
#else
min_brk = mm->start_brk;
#endif
@@ -265,7 +275,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ rlim = rlimit(RLIMIT_DATA);
if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
(mm->end_data - mm->start_data) > rlim)
goto out;
@@ -388,17 +398,23 @@ static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
+ struct vm_area_struct *next;
+
+ vma->vm_prev = prev;
if (prev) {
- vma->vm_next = prev->vm_next;
+ next = prev->vm_next;
prev->vm_next = vma;
} else {
mm->mmap = vma;
if (rb_parent)
- vma->vm_next = rb_entry(rb_parent,
+ next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
else
- vma->vm_next = NULL;
+ next = NULL;
}
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -437,7 +453,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
__vma_link_list(mm, vma, prev, rb_parent);
__vma_link_rb(mm, vma, rb_link, rb_parent);
- __anon_vma_link(vma);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -453,12 +468,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mapping->i_mmap_lock);
vma->vm_truncate_count = mapping->truncate_count;
}
- anon_vma_lock(vma);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
- anon_vma_unlock(vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -486,7 +499,11 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- prev->vm_next = vma->vm_next;
+ struct vm_area_struct *next = vma->vm_next;
+
+ prev->vm_next = next;
+ if (next)
+ next->vm_prev = prev;
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
@@ -499,7 +516,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
-void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
struct mm_struct *mm = vma->vm_mm;
@@ -507,12 +524,14 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
- struct file *file = vma->vm_file;
struct anon_vma *anon_vma = NULL;
+ struct file *file = vma->vm_file;
long adjust_next = 0;
int remove_next = 0;
if (next && !insert) {
+ struct vm_area_struct *exporter = NULL;
+
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
@@ -520,7 +539,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
*/
again: remove_next = 1 + (end > next->vm_end);
end = next->vm_end;
- anon_vma = next->anon_vma;
+ exporter = next;
importer = vma;
} else if (end > next->vm_start) {
/*
@@ -528,7 +547,7 @@ again: remove_next = 1 + (end > next->vm_end);
* mprotect case 5 shifting the boundary up.
*/
adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
- anon_vma = next->anon_vma;
+ exporter = next;
importer = vma;
} else if (end < vma->vm_end) {
/*
@@ -537,9 +556,20 @@ again: remove_next = 1 + (end > next->vm_end);
* mprotect case 4 shifting the boundary down.
*/
adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
- anon_vma = next->anon_vma;
+ exporter = vma;
importer = next;
}
+
+ /*
+ * Easily overlooked: when mprotect shifts the boundary,
+ * make sure the expanding vma has anon_vma set if the
+ * shrinking vma had, to cover any anon pages imported.
+ */
+ if (exporter && exporter->anon_vma && !importer->anon_vma) {
+ if (anon_vma_clone(importer, exporter))
+ return -ENOMEM;
+ importer->anon_vma = exporter->anon_vma;
+ }
}
if (file) {
@@ -567,23 +597,17 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ vma_adjust_trans_huge(vma, start, end, adjust_next);
+
/*
- * When changing only vma->vm_end, we don't really need
- * anon_vma lock.
+ * When changing only vma->vm_end, we don't really need anon_vma
+ * lock. This is a fairly rare case by itself, but the anon_vma
+ * lock may be shared between many sibling processes. Skipping
+ * the lock for brk adjustments makes a difference sometimes.
*/
- if (vma->anon_vma && (insert || importer || start != vma->vm_start))
+ if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
anon_vma = vma->anon_vma;
- if (anon_vma) {
- spin_lock(&anon_vma->lock);
- /*
- * Easily overlooked: when mprotect shifts the boundary,
- * make sure the expanding vma has anon_vma set if the
- * shrinking vma had, to cover any anon pages imported.
- */
- if (importer && !importer->anon_vma) {
- importer->anon_vma = anon_vma;
- __anon_vma_link(importer);
- }
+ anon_vma_lock(anon_vma);
}
if (root) {
@@ -616,8 +640,6 @@ again: remove_next = 1 + (end > next->vm_end);
__vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
- if (next->anon_vma)
- __anon_vma_merge(vma, next);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
@@ -628,7 +650,7 @@ again: remove_next = 1 + (end > next->vm_end);
}
if (anon_vma)
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -638,6 +660,8 @@ again: remove_next = 1 + (end > next->vm_end);
if (next->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(mm);
}
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
@@ -653,6 +677,8 @@ again: remove_next = 1 + (end > next->vm_end);
}
validate_mm(mm);
+
+ return 0;
}
/*
@@ -759,6 +785,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
+ int err;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -792,11 +819,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma)) {
/* cases 1, 6 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL);
} else /* cases 2, 5, 7 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL);
+ if (err)
+ return NULL;
+ khugepaged_enter_vma_merge(prev);
return prev;
}
@@ -808,11 +838,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen)) {
if (prev && addr < prev->vm_end) /* case 4 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
else /* cases 3, 8 */
- vma_adjust(area, addr, next->vm_end,
+ err = vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL);
+ if (err)
+ return NULL;
+ khugepaged_enter_vma_merge(area);
return area;
}
@@ -820,6 +853,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
}
/*
+ * Rough compatibility check to quickly see if it's even worth looking
+ * at sharing an anon_vma.
+ *
+ * They need to have the same vm_file, and the flags can only differ
+ * in things that mprotect may change.
+ *
+ * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
+ * we can merge the two vma's. For example, we refuse to merge a vma if
+ * there is a vm_ops->close() function, because that indicates that the
+ * driver is doing some kind of reference counting. But that doesn't
+ * really matter for the anon_vma sharing case.
+ */
+static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
+{
+ return a->vm_end == b->vm_start &&
+ mpol_equal(vma_policy(a), vma_policy(b)) &&
+ a->vm_file == b->vm_file &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+ b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
+}
+
+/*
+ * Do some basic sanity checking to see if we can re-use the anon_vma
+ * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
+ * the same as 'old', the other will be the new one that is trying
+ * to share the anon_vma.
+ *
+ * NOTE! This runs with mmap_sem held for reading, so it is possible that
+ * the anon_vma of 'old' is concurrently in the process of being set up
+ * by another page fault trying to merge _that_. But that's ok: if it
+ * is being set up, that automatically means that it will be a singleton
+ * acceptable for merging, so we can do all of this optimistically. But
+ * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ *
+ * IOW: that the "list_is_singular()" test on the anon_vma_chain only
+ * matters for the 'stable anon_vma' case (ie the thing we want to avoid
+ * is to return an anon_vma that is "complex" due to having gone through
+ * a fork).
+ *
+ * We also make sure that the two vma's are compatible (adjacent,
+ * and with the same memory policies). That's all stable, even with just
+ * a read lock on the mmap_sem.
+ */
+static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
+{
+ if (anon_vma_compatible(a, b)) {
+ struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
+
+ if (anon_vma && list_is_singular(&old->anon_vma_chain))
+ return anon_vma;
+ }
+ return NULL;
+}
+
+/*
* find_mergeable_anon_vma is used by anon_vma_prepare, to check
* neighbouring vmas for a suitable anon_vma, before it goes off
* to allocate a new anon_vma. It checks because a repetitive
@@ -829,28 +917,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
+ struct anon_vma *anon_vma;
struct vm_area_struct *near;
- unsigned long vm_flags;
near = vma->vm_next;
if (!near)
goto try_prev;
- /*
- * Since only mprotect tries to remerge vmas, match flags
- * which might be mprotected into each other later on.
- * Neither mlock nor madvise tries to remerge at present,
- * so leave their flags as obstructing a merge.
- */
- vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
- vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
-
- if (near->anon_vma && vma->vm_end == near->vm_start &&
- mpol_equal(vma_policy(vma), vma_policy(near)) &&
- can_vma_merge_before(near, vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff +
- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
- return near->anon_vma;
+ anon_vma = reusable_anon_vma(near, vma, near);
+ if (anon_vma)
+ return anon_vma;
try_prev:
/*
* It is potentially slow to have to call find_vma_prev here.
@@ -863,14 +939,9 @@ try_prev:
if (!near)
goto none;
- vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
- vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
-
- if (near->anon_vma && near->vm_end == vma->vm_start &&
- mpol_equal(vma_policy(near), vma_policy(vma)) &&
- can_vma_merge_after(near, vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff))
- return near->anon_vma;
+ anon_vma = reusable_anon_vma(near, near, vma);
+ if (anon_vma)
+ return anon_vma;
none:
/*
* There's no absolute need to look only at touching neighbours:
@@ -967,7 +1038,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long locked, lock_limit;
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
@@ -1051,6 +1122,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long retval = -EBADF;
if (!(flags & MAP_ANONYMOUS)) {
+ audit_mmap_fd(fd, flags);
if (unlikely(flags & MAP_HUGETLB))
return -EINVAL;
file = fget(fd);
@@ -1083,6 +1155,30 @@ out:
return retval;
}
+#ifdef __ARCH_WANT_SYS_OLD_MMAP
+struct mmap_arg_struct {
+ unsigned long addr;
+ unsigned long len;
+ unsigned long prot;
+ unsigned long flags;
+ unsigned long fd;
+ unsigned long offset;
+};
+
+SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
+{
+ struct mmap_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ if (a.offset & ~PAGE_MASK)
+ return -EINVAL;
+
+ return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset >> PAGE_SHIFT);
+}
+#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+
/*
* Some shared mappigns will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
@@ -1205,6 +1301,7 @@ munmap_back:
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
if (file) {
error = -EINVAL;
@@ -1265,13 +1362,8 @@ out:
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- /*
- * makes pages present; downgrades, drops, reacquires mmap_sem
- */
- long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
- if (nr_pages < 0)
- return nr_pages; /* vma gone! */
- mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
+ if (!mlock_vma_pages_range(vma, addr, addr + len))
+ mm->locked_vm += (len >> PAGE_SHIFT);
} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
make_pages_present(addr, addr + len);
return addr;
@@ -1599,7 +1691,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
return -ENOMEM;
/* Stack limit test */
- if (size > rlim[RLIMIT_STACK].rlim_cur)
+ if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -1607,7 +1699,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
}
@@ -1638,9 +1731,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-#ifndef CONFIG_IA64
-static
-#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
@@ -1654,7 +1744,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1665,7 +1755,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return -ENOMEM;
}
error = 0;
@@ -1678,10 +1768,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
grow = (address - vma->vm_end) >> PAGE_SHIFT;
error = acct_stack_growth(vma, size, grow);
- if (!error)
+ if (!error) {
vma->vm_end = address;
+ perf_event_mmap(vma);
+ }
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1706,7 +1799,7 @@ static int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1725,9 +1818,11 @@ static int expand_downwards(struct vm_area_struct *vma,
if (!error) {
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ perf_event_mmap(vma);
}
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
@@ -1754,8 +1849,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED) {
- if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
- return NULL; /* vma gone! */
+ mlock_vma_pages_range(prev, addr, prev->vm_end);
}
return prev;
}
@@ -1783,8 +1877,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED) {
- if (mlock_vma_pages_range(vma, addr, start) < 0)
- return NULL; /* vma gone! */
+ mlock_vma_pages_range(vma, addr, start);
}
return vma;
}
@@ -1846,6 +1939,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ vma->vm_prev = NULL;
do {
rb_erase(&vma->vm_rb, &mm->mm_rb);
mm->map_count--;
@@ -1853,6 +1947,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
+ if (vma)
+ vma->vm_prev = prev;
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
@@ -1871,6 +1967,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
{
struct mempolicy *pol;
struct vm_area_struct *new;
+ int err = -ENOMEM;
if (is_vm_hugetlb_page(vma) && (addr &
~(huge_page_mask(hstate_vma(vma)))))
@@ -1878,11 +1975,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
- return -ENOMEM;
+ goto out_err;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+
if (new_below)
new->vm_end = addr;
else {
@@ -1892,11 +1991,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
- kmem_cache_free(vm_area_cachep, new);
- return PTR_ERR(pol);
+ err = PTR_ERR(pol);
+ goto out_free_vma;
}
vma_set_policy(new, pol);
+ if (anon_vma_clone(new, vma))
+ goto out_free_mpol;
+
if (new->vm_file) {
get_file(new->vm_file);
if (vma->vm_flags & VM_EXECUTABLE)
@@ -1907,12 +2009,30 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new->vm_ops->open(new);
if (new_below)
- vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+ err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
((addr - new->vm_start) >> PAGE_SHIFT), new);
else
- vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+ err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
- return 0;
+ /* Success. */
+ if (!err)
+ return 0;
+
+ /* Clean everything up if vma_adjust failed. */
+ if (new->vm_ops && new->vm_ops->close)
+ new->vm_ops->close(new);
+ if (new->vm_file) {
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ fput(new->vm_file);
+ }
+ unlink_anon_vmas(new);
+ out_free_mpol:
+ mpol_put(pol);
+ out_free_vma:
+ kmem_cache_free(vm_area_cachep, new);
+ out_err:
+ return err;
}
/*
@@ -2074,7 +2194,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
unsigned long locked, lock_limit;
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
@@ -2122,6 +2242,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
return -ENOMEM;
}
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
@@ -2130,6 +2251,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
+ perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -2258,10 +2380,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma) {
*new_vma = *vma;
pol = mpol_dup(vma_policy(vma));
- if (IS_ERR(pol)) {
- kmem_cache_free(vm_area_cachep, new_vma);
- return NULL;
- }
+ if (IS_ERR(pol))
+ goto out_free_vma;
+ INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
vma_set_policy(new_vma, pol);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
@@ -2277,6 +2400,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
}
}
return new_vma;
+
+ out_free_mempol:
+ mpol_put(pol);
+ out_free_vma:
+ kmem_cache_free(vm_area_cachep, new_vma);
+ return NULL;
}
/*
@@ -2288,7 +2417,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
unsigned long cur = mm->total_vm; /* pages */
unsigned long lim;
- lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
if (cur + npages > lim)
return 0;
@@ -2348,12 +2477,14 @@ int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, struct page **pages)
{
+ int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (unlikely(vma == NULL))
return -ENOMEM;
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
@@ -2364,39 +2495,46 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
- if (unlikely(insert_vm_struct(mm, vma))) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+ ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+ if (ret)
+ goto out;
+
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
mm->total_vm += len >> PAGE_SHIFT;
perf_event_mmap(vma);
return 0;
+
+out:
+ kmem_cache_free(vm_area_cachep, vma);
+ return ret;
}
static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
- * anon_vma->lock. If some other vma in this mm shares
+ * anon_vma->root->lock. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->lock.
+ * anon_vma->root->lock.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->root->head.next))
BUG();
}
}
@@ -2454,6 +2592,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int ret = -EINTR;
BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2471,7 +2610,8 @@ int mm_take_all_locks(struct mm_struct *mm)
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
- vm_lock_anon_vma(mm, vma->anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_lock_anon_vma(mm, avc->anon_vma);
}
ret = 0;
@@ -2485,7 +2625,7 @@ out_unlock:
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
@@ -2496,12 +2636,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->lock.
+ * anon_vma->root->lock.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->root->head.next))
BUG();
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
}
@@ -2526,13 +2666,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
BUG_ON(down_read_trylock(&mm->mmap_sem));
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma)
- vm_unlock_anon_vma(vma->anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_unlock_anon_vma(avc->anon_vma);
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index ded9081f4021..9e82e937000e 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/mmu_context.h>
+#include <linux/module.h>
#include <linux/sched.h>
#include <asm/mmu_context.h>
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm)
if (active_mm != mm)
mmdrop(active_mm);
}
+EXPORT_SYMBOL_GPL(use_mm);
/*
* unuse_mm
@@ -51,8 +53,10 @@ void unuse_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
+ sync_mm_rss(tsk, mm);
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
task_unlock(tsk);
}
+EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 7e33f2cb3c77..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -16,6 +16,7 @@
#include <linux/err.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/slab.h>
/*
* This function can't run concurrently against mmu_notifier_register
@@ -99,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}
+int __mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+ int young = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->test_young) {
+ young = mn->ops->test_young(mn, mm, address);
+ if (young)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return young;
+}
+
void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
pte_t pte)
{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8bc969d8112d..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -10,7 +10,6 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
-#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
@@ -79,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_unmap_unlock(pte - 1, ptl);
}
-static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -89,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ else if (change_huge_pmd(vma, pmd, addr, newprot))
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
+ change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
+ dirty_accountable);
} while (pmd++, addr = next, addr != end);
}
-static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -107,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
+ change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable);
} while (pud++, addr = next, addr != end);
}
@@ -127,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
+ change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
}
@@ -212,6 +221,7 @@ success:
mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ perf_event_mmap(vma);
return 0;
fail:
@@ -300,7 +310,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
- perf_event_mmap(vma);
nstart = tmp;
if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index 845190898d59..9925b6391b80 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
-#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
@@ -42,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL;
pmd = pmd_offset(pud, addr);
+ split_huge_page_pmd(mm, pmd);
if (pmd_none_or_clear_bad(pmd))
return NULL;
return pmd;
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -63,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
if (!pmd)
return NULL;
- if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
return NULL;
return pmd;
@@ -102,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* pte locks because exclusive mmap_sem prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_pte = pte_offset_map(new_pmd, new_addr);
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -120,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
arch_leave_lazy_mmu_mode();
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
- pte_unmap_nested(new_pte - 1);
+ pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -148,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
@@ -285,7 +287,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = mm->locked_vm << PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
locked += new_len - old_len;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
goto Eagain;
@@ -460,8 +462,11 @@ unsigned long do_mremap(unsigned long addr,
if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
- vma_adjust(vma, vma->vm_start,
- addr + new_len, vma->vm_pgoff, NULL);
+ if (vma_adjust(vma, vma->vm_start, addr + new_len,
+ vma->vm_pgoff, NULL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
mm->total_vm += pages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
diff --git a/mm/msync.c b/mm/msync.c
index 4083209b7f02..632df4527c01 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- error = vfs_fsync(file, file->f_path.dentry, 0);
+ error = vfs_fsync(file, 0);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 48a2ecfaf059..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
* Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
* Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
- * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
+ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
#include <linux/module.h>
@@ -29,6 +29,7 @@
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/audit.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -36,11 +37,6 @@
#include <asm/mmu_context.h>
#include "internal.h"
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
-
#if 0
#define kenter(FMT, ...) \
printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -131,7 +127,8 @@ unsigned int kobjsize(const void *objp)
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *retry)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
@@ -162,7 +159,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
if (vmas)
vmas[i] = vma;
- start += PAGE_SIZE;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
}
return i;
@@ -189,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
@@ -298,12 +296,60 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+/*
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ *
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
void *vmalloc_node(unsigned long size, int node)
{
return vmalloc(size);
}
EXPORT_SYMBOL(vmalloc_node);
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return vzalloc(size);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -397,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
{
}
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ *
+ * Returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+ BUG();
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
@@ -609,7 +680,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma, **pp;
+ struct vm_area_struct *pvma, **pp, *next;
struct address_space *mapping;
struct rb_node **p, *parent;
@@ -669,8 +740,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
break;
}
- vma->vm_next = *pp;
+ next = *pp;
*pp = vma;
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
/*
@@ -918,14 +992,6 @@ static int validate_mmap_request(struct file *file,
if (!(capabilities & BDI_CAP_MAP_DIRECT))
return -ENODEV;
- if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
- ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
- ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
- ) {
- printk("MAP_SHARED not completely supported on !MMU\n");
- return -EINVAL;
- }
-
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
}
@@ -941,6 +1007,20 @@ static int validate_mmap_request(struct file *file,
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
+ if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
+ ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
+ ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ ) {
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ if (flags & MAP_SHARED) {
+ printk(KERN_WARNING
+ "MAP_SHARED not completely supported on !MMU\n");
+ return -EINVAL;
+ }
+ }
+ }
+
/* handle executable mappings and implied executable
* mappings */
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
@@ -996,22 +1076,20 @@ static unsigned long determine_vm_flags(struct file *file,
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
- vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* vm_flags |= mm->def_flags; */
if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (file && !(prot & PROT_WRITE))
vm_flags |= VM_MAYSHARE;
- }
- else {
+ } else {
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
+ vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
if (flags & MAP_SHARED)
- vm_flags |= VM_MAYSHARE | VM_SHARED;
- else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
- vm_flags |= VM_MAYSHARE;
+ vm_flags |= VM_SHARED;
}
/* refuse to let anyone share private mappings with this process if
@@ -1040,10 +1118,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
if (ret != -ENOSYS)
return ret;
- /* getting an ENOSYS error indicates that direct mmap isn't
- * possible (as opposed to tried but failed) so we'll fall
- * through to making a private copy of the data and mapping
- * that if we can */
+ /* getting -ENOSYS indicates that direct mmap isn't possible (as
+ * opposed to tried but failed) so we can only give a suitable error as
+ * it's not possible to make a private copy if MAP_SHARED was given */
return -ENODEV;
}
@@ -1209,7 +1286,7 @@ unsigned long do_mmap_pgoff(struct file *file,
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_node);
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_flags = vm_flags;
vma->vm_pgoff = pgoff;
@@ -1410,6 +1487,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
struct file *file = NULL;
unsigned long retval = -EBADF;
+ audit_mmap_fd(fd, flags);
if (!(flags & MAP_ANONYMOUS)) {
file = fget(fd);
if (!file)
@@ -1428,6 +1506,30 @@ out:
return retval;
}
+#ifdef __ARCH_WANT_SYS_OLD_MMAP
+struct mmap_arg_struct {
+ unsigned long addr;
+ unsigned long len;
+ unsigned long prot;
+ unsigned long flags;
+ unsigned long fd;
+ unsigned long offset;
+};
+
+SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
+{
+ struct mmap_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ if (a.offset & ~PAGE_MASK)
+ return -EINVAL;
+
+ return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset >> PAGE_SHIFT);
+}
+#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+
/*
* split a vma into two pieces at address 'addr', a new vma is allocated either
* for the first part or the tail.
@@ -1643,6 +1745,7 @@ void exit_mmap(struct mm_struct *mm)
mm->mmap = vma->vm_next;
delete_vma_from_mm(vma);
delete_vma(mm, vma);
+ cond_resched();
}
kleave("");
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52481b1c1e5..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
+ * Copyright (C) 2010 Google, Inc.
+ * Rewritten by David Rientjes
*
* The routines in this file are used to kill a process when
* we're seriously out of memory. This gets called from __alloc_pages()
@@ -18,6 +20,7 @@
#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
@@ -26,171 +29,194 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
#include <linux/security.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
-int sysctl_oom_dump_tasks;
+int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
-/* #define DEBUG */
+
+#ifdef CONFIG_NUMA
+/**
+ * has_intersects_mems_allowed() - check task eligibility for kill
+ * @tsk: task struct of which task to consider
+ * @mask: nodemask passed to page allocator for mempolicy ooms
+ *
+ * Task eligibility is determined by whether or not a candidate task, @tsk,
+ * shares the same mempolicy nodes as current if it is bound by such a policy
+ * and whether or not it has the same set of allowed cpuset nodes.
+ */
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct task_struct *start = tsk;
+
+ do {
+ if (mask) {
+ /*
+ * If this is a mempolicy constrained oom, tsk's
+ * cpuset is irrelevant. Only return true if its
+ * mempolicy intersects current, otherwise it may be
+ * needlessly killed.
+ */
+ if (mempolicy_nodemask_intersects(tsk, mask))
+ return true;
+ } else {
+ /*
+ * This is not a mempolicy constrained oom, so only
+ * check the mems of tsk's cpuset.
+ */
+ if (cpuset_mems_allowed_intersects(current, tsk))
+ return true;
+ }
+ } while_each_thread(start, tsk);
+
+ return false;
+}
+#else
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ return true;
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * If this is a system OOM (not a memcg OOM) and the task selected to be
+ * killed is not already running at high (RT) priorities, speed up the
+ * recovery by boosting the dying task to the lowest FIFO priority.
+ * That helps with the recovery and avoids interfering with RT tasks.
+ */
+static void boost_dying_task_prio(struct task_struct *p,
+ struct mem_cgroup *mem)
+{
+ struct sched_param param = { .sched_priority = 1 };
+
+ if (mem)
+ return;
+
+ if (!rt_task(p))
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+}
/*
- * Is all threads of the target process nodes overlap ours?
+ * The process p may have detached its own ->mm while exiting or through
+ * use_mm(), but one or more of its subthreads may still have a valid
+ * pointer. Return p, or any of its subthreads with a valid ->mm, with
+ * task_lock() held.
*/
-static int has_intersects_mems_allowed(struct task_struct *tsk)
+struct task_struct *find_lock_task_mm(struct task_struct *p)
{
- struct task_struct *t;
+ struct task_struct *t = p;
- t = tsk;
do {
- if (cpuset_mems_allowed_intersects(current, t))
- return 1;
- t = next_thread(t);
- } while (t != tsk);
+ task_lock(t);
+ if (likely(t->mm))
+ return t;
+ task_unlock(t);
+ } while_each_thread(p, t);
- return 0;
+ return NULL;
+}
+
+/* return true if the task is not adequate as candidate victim task. */
+static bool oom_unkillable_task(struct task_struct *p,
+ const struct mem_cgroup *mem, const nodemask_t *nodemask)
+{
+ if (is_global_init(p))
+ return true;
+ if (p->flags & PF_KTHREAD)
+ return true;
+
+ /* When mem_cgroup_out_of_memory() and p is not member of the group */
+ if (mem && !task_in_mem_cgroup(p, mem))
+ return true;
+
+ /* p may not have freeable memory in nodemask */
+ if (!has_intersects_mems_allowed(p, nodemask))
+ return true;
+
+ return false;
}
/**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
- * @uptime: current uptime in seconds
- *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
+ * @totalpages: total present RAM allowed for page allocation
*
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- * algorithm has been meticulously tuned to meet the principle
- * of least surprise ... (be careful when you change it)
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible. The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
*/
-
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+ const nodemask_t *nodemask, unsigned long totalpages)
{
- unsigned long points, cpu_time, run_time;
- struct mm_struct *mm;
- struct task_struct *child;
- int oom_adj = p->signal->oom_adj;
- struct task_cputime task_time;
- unsigned long utime;
- unsigned long stime;
+ int points;
- if (oom_adj == OOM_DISABLE)
+ if (oom_unkillable_task(p, mem, nodemask))
return 0;
- task_lock(p);
- mm = p->mm;
- if (!mm) {
- task_unlock(p);
+ p = find_lock_task_mm(p);
+ if (!p)
return 0;
- }
/*
- * The memory size of the process is the basis for the badness.
+ * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
+ * so the entire heuristic doesn't need to be executed for something
+ * that cannot be killed.
*/
- points = mm->total_vm;
-
- /*
- * After this unlock we can no longer dereference local variable `mm'
- */
- task_unlock(p);
-
- /*
- * swapoff can easily use up all memory, so kill those first.
- */
- if (p->flags & PF_OOM_ORIGIN)
- return ULONG_MAX;
-
- /*
- * Processes which fork a lot of child processes are likely
- * a good choice. We add half the vmsize of the children if they
- * have an own mm. This prevents forking servers to flood the
- * machine with an endless amount of children. In case a single
- * child is eating the vast majority of memory, adding only half
- * to the parents will make the child our kill candidate of choice.
- */
- list_for_each_entry(child, &p->children, sibling) {
- task_lock(child);
- if (child->mm != mm && child->mm)
- points += child->mm->total_vm/2 + 1;
- task_unlock(child);
+ if (atomic_read(&p->mm->oom_disable_count)) {
+ task_unlock(p);
+ return 0;
}
/*
- * CPU time is in tens of seconds and run time is in thousands
- * of seconds. There is no particular reason for this other than
- * that it turned out to work very well in practice.
+ * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
+ * priority for oom killing.
*/
- thread_group_cputime(p, &task_time);
- utime = cputime_to_jiffies(task_time.utime);
- stime = cputime_to_jiffies(task_time.stime);
- cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
-
-
- if (uptime >= p->start_time.tv_sec)
- run_time = (uptime - p->start_time.tv_sec) >> 10;
- else
- run_time = 0;
-
- if (cpu_time)
- points /= int_sqrt(cpu_time);
- if (run_time)
- points /= int_sqrt(int_sqrt(run_time));
+ if (p->flags & PF_OOM_ORIGIN) {
+ task_unlock(p);
+ return 1000;
+ }
/*
- * Niced processes are most likely less important, so double
- * their badness points.
+ * The memory controller may have a limit of 0 bytes, so avoid a divide
+ * by zero, if necessary.
*/
- if (task_nice(p) > 0)
- points *= 2;
+ if (!totalpages)
+ totalpages = 1;
/*
- * Superuser processes are usually more important, so we make it
- * less likely that we kill those.
+ * The baseline for the badness score is the proportion of RAM that each
+ * task's rss and swap space use.
*/
- if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
- has_capability_noaudit(p, CAP_SYS_RESOURCE))
- points /= 4;
+ points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
+ totalpages;
+ task_unlock(p);
/*
- * We don't want to kill a process with direct hardware access.
- * Not only could that mess up the hardware, but usually users
- * tend to only have this flag set on applications they think
- * of as important.
+ * Root processes get 3% bonus, just like the __vm_enough_memory()
+ * implementation used by LSMs.
*/
- if (has_capability_noaudit(p, CAP_SYS_RAWIO))
- points /= 4;
+ if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+ points -= 30;
/*
- * If p's nodes don't overlap ours, it may still help to kill p
- * because p may have allocated or otherwise mapped memory on
- * this node before. However it will be less likely.
+ * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
+ * either completely disable oom killing or always prefer a certain
+ * task.
*/
- if (!has_intersects_mems_allowed(p))
- points /= 8;
+ points += p->signal->oom_score_adj;
/*
- * Adjust the score by oom_adj.
+ * Never return 0 for an eligible task that may be killed since it's
+ * possible that no single user task uses more than 0.1% of memory and
+ * no single admin task uses more than 3.0%.
*/
- if (oom_adj) {
- if (oom_adj > 0) {
- if (!points)
- points = 1;
- points <<= oom_adj;
- } else
- points >>= -(oom_adj);
- }
-
-#ifdef DEBUG
- printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
- p->pid, p->comm, points);
-#endif
- return points;
+ if (points <= 0)
+ return 1;
+ return (points < 1000) ? points : 1000;
}
/*
@@ -198,12 +224,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
*/
#ifdef CONFIG_NUMA
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask)
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ bool cpuset_limited = false;
+ int nid;
+ /* Default to all available memory */
+ *totalpages = totalram_pages + total_swap_pages;
+
+ if (!zonelist)
+ return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
@@ -213,26 +247,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
/*
- * The nodemask here is a nodemask passed to alloc_pages(). Now,
- * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
- * feature. mempolicy is an only user of nodemask here.
- * check mempolicy's nodemask contains all N_HIGH_MEMORY
+ * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+ * the page allocator means a mempolicy is in effect. Cpuset policy
+ * is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
+ if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, *nodemask)
+ *totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
+ }
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask)
if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
- return CONSTRAINT_CPUSET;
+ cpuset_limited = true;
+ if (cpuset_limited) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, cpuset_current_mems_allowed)
+ *totalpages += node_spanned_pages(nid);
+ return CONSTRAINT_CPUSET;
+ }
return CONSTRAINT_NONE;
}
#else
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask)
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
{
+ *totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif
@@ -243,28 +288,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned long *ppoints,
- struct mem_cgroup *mem)
+static struct task_struct *select_bad_process(unsigned int *ppoints,
+ unsigned long totalpages, struct mem_cgroup *mem,
+ const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *chosen = NULL;
- struct timespec uptime;
*ppoints = 0;
- do_posix_clock_monotonic_gettime(&uptime);
for_each_process(p) {
- unsigned long points;
+ unsigned int points;
- /*
- * skip kernel threads and tasks which have already released
- * their mm.
- */
- if (!p->mm)
- continue;
- /* skip the init task */
- if (is_global_init(p))
- continue;
- if (mem && !task_in_mem_cgroup(p, mem))
+ if (oom_unkillable_task(p, mem, nodemask))
continue;
/*
@@ -289,19 +324,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
* the process of exiting and releasing its resources.
* Otherwise we could get an easy OOM deadlock.
*/
- if (p->flags & PF_EXITING) {
+ if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
if (p != current)
return ERR_PTR(-1UL);
chosen = p;
- *ppoints = ULONG_MAX;
+ *ppoints = 1000;
}
- if (p->signal->oom_adj == OOM_DISABLE)
- continue;
-
- points = badness(p, uptime.tv_sec);
- if (points > *ppoints || !chosen) {
+ points = oom_badness(p, mem, nodemask, totalpages);
+ if (points > *ppoints) {
chosen = p;
*ppoints = points;
}
@@ -312,175 +344,208 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
- * @mem: target memory controller
+ * @mem: current's memory controller, if constrained
+ * @nodemask: nodemask passed to page allocator for mempolicy ooms
*
- * Dumps the current memory state of all system tasks, excluding kernel threads.
+ * Dumps the current memory state of all eligible tasks. Tasks not in the same
+ * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
+ * are not shown.
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
- * score, and name.
- *
- * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
- * shown.
+ * value, oom_score_adj value, and name.
*
* Call with tasklist_lock read-locked.
*/
-static void dump_tasks(const struct mem_cgroup *mem)
+static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
- struct task_struct *g, *p;
-
- printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
- "name\n");
- do_each_thread(g, p) {
- struct mm_struct *mm;
+ struct task_struct *p;
+ struct task_struct *task;
- if (mem && !task_in_mem_cgroup(p, mem))
- continue;
- if (!thread_group_leader(p))
+ pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
+ for_each_process(p) {
+ if (oom_unkillable_task(p, mem, nodemask))
continue;
- task_lock(p);
- mm = p->mm;
- if (!mm) {
+ task = find_lock_task_mm(p);
+ if (!task) {
/*
- * total_vm and rss sizes do not exist for tasks with no
- * mm so there's no need to report them; they can't be
- * oom killed anyway.
+ * This is a kthread or all of p's threads have already
+ * detached their mm's. There's no need to report
+ * them; they can't be oom killed anyway.
*/
- task_unlock(p);
continue;
}
- printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
- p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
- get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
- p->comm);
- task_unlock(p);
- } while_each_thread(g, p);
+
+ pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
+ task->pid, task_uid(task), task->tgid,
+ task->mm->total_vm, get_mm_rss(task->mm),
+ task_cpu(task), task->signal->oom_adj,
+ task->signal->oom_score_adj, task->comm);
+ task_unlock(task);
+ }
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *mem)
+ struct mem_cgroup *mem, const nodemask_t *nodemask)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_adj=%d\n",
- current->comm, gfp_mask, order, current->signal->oom_adj);
task_lock(current);
+ pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+ "oom_adj=%d, oom_score_adj=%d\n",
+ current->comm, gfp_mask, order, current->signal->oom_adj,
+ current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
mem_cgroup_print_oom_info(mem, p);
show_mem();
if (sysctl_oom_dump_tasks)
- dump_tasks(mem);
+ dump_tasks(mem, nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
-
-/*
- * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
- * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
- * set.
- */
-static void __oom_kill_task(struct task_struct *p, int verbose)
+static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
- if (is_global_init(p)) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill init!\n");
- return;
- }
+ struct task_struct *q;
+ struct mm_struct *mm;
- task_lock(p);
- if (!p->mm) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
- task_pid_nr(p), p->comm);
- task_unlock(p);
- return;
- }
+ p = find_lock_task_mm(p);
+ if (!p)
+ return 1;
- if (verbose)
- printk(KERN_ERR "Killed process %d (%s) "
- "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
- task_pid_nr(p), p->comm,
- K(p->mm->total_vm),
- K(get_mm_counter(p->mm, anon_rss)),
- K(get_mm_counter(p->mm, file_rss)));
+ /* mm cannot be safely dereferenced after task_unlock(p) */
+ mm = p->mm;
+
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ task_pid_nr(p), p->comm, K(p->mm->total_vm),
+ K(get_mm_counter(p->mm, MM_ANONPAGES)),
+ K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
/*
- * We give our sacrificial lamb high priority and access to
- * all the memory it needs. That way it should be able to
- * exit() and clear out its resources quickly...
+ * Kill all processes sharing p->mm in other thread groups, if any.
+ * They don't get access to memory reserves or a higher scheduler
+ * priority, though, to avoid depletion of all memory or task
+ * starvation. This prevents mm->mmap_sem livelock when an oom killed
+ * task cannot exit because it requires the semaphore and it's contended
+ * by another thread trying to allocate memory itself. That thread will
+ * now get access to memory reserves since it has a pending fatal
+ * signal.
*/
- p->rt.time_slice = HZ;
- set_tsk_thread_flag(p, TIF_MEMDIE);
+ for_each_process(q)
+ if (q->mm == mm && !same_thread_group(q, p)) {
+ task_lock(q); /* Protect ->comm from prctl() */
+ pr_err("Kill process %d (%s) sharing same memory\n",
+ task_pid_nr(q), q->comm);
+ task_unlock(q);
+ force_sig(SIGKILL, q);
+ }
+ set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
-}
-static int oom_kill_task(struct task_struct *p)
-{
- /* WARNING: mm may not be dereferenced since we did not obtain its
- * value from get_task_mm(p). This is OK since all we need to do is
- * compare mm to q->mm below.
- *
- * Furthermore, even if mm contains a non-NULL value, p->mm may
- * change to NULL at any time since we do not hold task_lock(p).
- * However, this is of no concern to us.
+ /*
+ * We give our sacrificial lamb high priority and access to
+ * all the memory it needs. That way it should be able to
+ * exit() and clear out its resources quickly...
*/
- if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
- return 1;
-
- __oom_kill_task(p, 1);
+ boost_dying_task_prio(p, mem);
return 0;
}
+#undef K
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned long points, struct mem_cgroup *mem,
+ unsigned int points, unsigned long totalpages,
+ struct mem_cgroup *mem, nodemask_t *nodemask,
const char *message)
{
- struct task_struct *c;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t = p;
+ unsigned int victim_points = 0;
if (printk_ratelimit())
- dump_header(p, gfp_mask, order, mem);
+ dump_header(p, gfp_mask, order, mem, nodemask);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
if (p->flags & PF_EXITING) {
- __oom_kill_task(p, 0);
+ set_tsk_thread_flag(p, TIF_MEMDIE);
+ boost_dying_task_prio(p, mem);
return 0;
}
- printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
- message, task_pid_nr(p), p->comm, points);
+ task_lock(p);
+ pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+ task_unlock(p);
- /* Try to kill a child first */
- list_for_each_entry(c, &p->children, sibling) {
- if (c->mm == p->mm)
- continue;
- if (!oom_kill_task(c))
- return 0;
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ do {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child, mem, nodemask,
+ totalpages);
+ if (child_points > victim_points) {
+ victim = child;
+ victim_points = child_points;
+ }
+ }
+ } while_each_thread(p, t);
+
+ return oom_kill_task(victim, mem);
+}
+
+/*
+ * Determines whether the kernel must panic because of the panic_on_oom sysctl.
+ */
+static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
+ int order, const nodemask_t *nodemask)
+{
+ if (likely(!sysctl_panic_on_oom))
+ return;
+ if (sysctl_panic_on_oom != 2) {
+ /*
+ * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
+ * does not panic for cpuset, mempolicy, or memcg allocation
+ * failures.
+ */
+ if (constraint != CONSTRAINT_NONE)
+ return;
}
- return oom_kill_task(p);
+ read_lock(&tasklist_lock);
+ dump_header(NULL, gfp_mask, order, NULL, nodemask);
+ read_unlock(&tasklist_lock);
+ panic("Out of memory: %s panic_on_oom is enabled\n",
+ sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
- unsigned long points = 0;
+ unsigned long limit;
+ unsigned int points = 0;
struct task_struct *p;
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
+ limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
- p = select_bad_process(&points, mem);
- if (PTR_ERR(p) == -1UL)
+ p = select_bad_process(&points, limit, mem, NULL);
+ if (!p || PTR_ERR(p) == -1UL)
goto out;
- if (!p)
- p = current;
-
- if (oom_kill_process(p, gfp_mask, 0, points, mem,
+ if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
"Memory cgroup out of memory"))
goto retry;
out:
@@ -507,7 +572,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
* if a parallel OOM killing is already taking place that includes a zone in
* the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
*/
-int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
struct zoneref *z;
struct zone *zone;
@@ -524,7 +589,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
/*
* Lock each zone in the zonelist under zone_scan_lock so a
- * parallel invocation of try_set_zone_oom() doesn't succeed
+ * parallel invocation of try_set_zonelist_oom() doesn't succeed
* when it shouldn't.
*/
zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -553,73 +618,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
}
/*
- * Must be called with tasklist_lock held for read.
+ * Try to acquire the oom killer lock for all system zones. Returns zero if a
+ * parallel oom killing is taking place, otherwise locks all zones and returns
+ * non-zero.
*/
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static int try_set_system_oom(void)
{
- struct task_struct *p;
- unsigned long points;
-
- if (sysctl_oom_kill_allocating_task)
- if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
- "Out of memory (oom_kill_allocating_task)"))
- return;
-retry:
- /*
- * Rambo mode: Shoot down a process and hope it solves whatever
- * issues we may have.
- */
- p = select_bad_process(&points, NULL);
-
- if (PTR_ERR(p) == -1UL)
- return;
-
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- dump_header(NULL, gfp_mask, order, NULL);
- panic("Out of memory and no killable processes...\n");
- }
+ struct zone *zone;
+ int ret = 1;
- if (oom_kill_process(p, gfp_mask, order, points, NULL,
- "Out of memory"))
- goto retry;
+ spin_lock(&zone_scan_lock);
+ for_each_populated_zone(zone)
+ if (zone_is_oom_locked(zone)) {
+ ret = 0;
+ goto out;
+ }
+ for_each_populated_zone(zone)
+ zone_set_flag(zone, ZONE_OOM_LOCKED);
+out:
+ spin_unlock(&zone_scan_lock);
+ return ret;
}
/*
- * pagefault handler calls into here because it is out of memory but
- * doesn't know exactly how or why.
+ * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
+ * attempts or page faults may now recall the oom killer, if necessary.
*/
-void pagefault_out_of_memory(void)
+static void clear_system_oom(void)
{
- unsigned long freed = 0;
-
- blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
- if (freed > 0)
- /* Got some memory back in the last second. */
- return;
-
- /*
- * If this is from memcg, oom-killer is already invoked.
- * and not worth to go system-wide-oom.
- */
- if (mem_cgroup_oom_called(current))
- goto rest_and_return;
-
- if (sysctl_panic_on_oom)
- panic("out of memory from page fault. panic_on_oom is selected.\n");
-
- read_lock(&tasklist_lock);
- __out_of_memory(0, 0); /* unknown gfp_mask and order */
- read_unlock(&tasklist_lock);
+ struct zone *zone;
- /*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory.
- */
-rest_and_return:
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ spin_lock(&zone_scan_lock);
+ for_each_populated_zone(zone)
+ zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ spin_unlock(&zone_scan_lock);
}
/**
@@ -627,6 +659,7 @@ rest_and_return:
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
@@ -636,49 +669,93 @@ rest_and_return:
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
{
+ const nodemask_t *mpol_mask;
+ struct task_struct *p;
+ unsigned long totalpages;
unsigned long freed = 0;
- enum oom_constraint constraint;
+ unsigned int points;
+ enum oom_constraint constraint = CONSTRAINT_NONE;
+ int killed = 0;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
return;
- if (sysctl_panic_on_oom == 2) {
- dump_header(NULL, gfp_mask, order, NULL);
- panic("out of memory. Compulsory panic_on_oom is selected.\n");
+ /*
+ * If current has a pending SIGKILL, then automatically select it. The
+ * goal is to allow it to allocate so that it may quickly exit and free
+ * its memory.
+ */
+ if (fatal_signal_pending(current)) {
+ set_thread_flag(TIF_MEMDIE);
+ boost_dying_task_prio(current, NULL);
+ return;
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+ constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+ &totalpages);
+ mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
+
read_lock(&tasklist_lock);
+ if (sysctl_oom_kill_allocating_task &&
+ !oom_unkillable_task(current, NULL, nodemask) &&
+ current->mm && !atomic_read(&current->mm->oom_disable_count)) {
+ /*
+ * oom_kill_process() needs tasklist_lock held. If it returns
+ * non-zero, current could not be killed so we must fallback to
+ * the tasklist scan.
+ */
+ if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
+ NULL, nodemask,
+ "Out of memory (oom_kill_allocating_task)"))
+ goto out;
+ }
- switch (constraint) {
- case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, gfp_mask, order, 0, NULL,
- "No available memory (MPOL_BIND)");
- break;
+retry:
+ p = select_bad_process(&points, totalpages, NULL, mpol_mask);
+ if (PTR_ERR(p) == -1UL)
+ goto out;
- case CONSTRAINT_NONE:
- if (sysctl_panic_on_oom) {
- dump_header(NULL, gfp_mask, order, NULL);
- panic("out of memory. panic_on_oom is selected\n");
- }
- /* Fall-through */
- case CONSTRAINT_CPUSET:
- __out_of_memory(gfp_mask, order);
- break;
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ read_unlock(&tasklist_lock);
+ panic("Out of memory and no killable processes...\n");
}
+ if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+ nodemask, "Out of memory"))
+ goto retry;
+ killed = 1;
+out:
read_unlock(&tasklist_lock);
/*
* Give "p" a good chance of killing itself before we
* retry to allocate memory unless "p" is current
*/
+ if (killed && !test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
+}
+
+/*
+ * The pagefault handler calls here because it is out of memory, so kill a
+ * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
+ * oom killing is already in progress so do nothing. If a task is found with
+ * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
+ */
+void pagefault_out_of_memory(void)
+{
+ if (try_set_system_oom()) {
+ out_of_memory(NULL, 0, 0, NULL);
+ clear_system_oom();
+ }
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8b..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <trace/events/writeback.h>
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
}
}
-/*
- * Clip the earned share of dirty pages to that which is actually available.
- * This avoids exceeding the total dirty_limit when the floating averages
- * fluctuate too quickly.
- */
-static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
- unsigned long dirty, unsigned long *pbdi_dirty)
-{
- unsigned long avail_dirty;
-
- avail_dirty = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_WRITEBACK) +
- global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK_TEMP);
-
- if (avail_dirty < dirty)
- avail_dirty = dirty - avail_dirty;
- else
- avail_dirty = 0;
-
- avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
- bdi_stat(bdi, BDI_WRITEBACK);
-
- *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
-}
-
static inline void task_dirties_fraction(struct task_struct *tsk,
long *numerator, long *denominator)
{
@@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
}
/*
- * scale the dirty limit
+ * task_dirty_limit - scale down dirty throttling threshold for one task
*
* task specific dirty limit:
*
* dirty -= (dirty/8) * p_{t}
+ *
+ * To protect light/slow dirtying tasks from heavier/fast ones, we start
+ * throttling individual tasks before reaching the bdi dirty limit.
+ * Relatively low thresholds will be allocated to heavy dirtiers. So when
+ * dirty pages grow large, heavy dirtiers will be throttled first, which will
+ * effectively curb the growth of dirty pages. Light dirtiers with high enough
+ * dirty threshold may never get throttled.
*/
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+ unsigned long bdi_dirty)
{
long numerator, denominator;
- unsigned long dirty = *pdirty;
+ unsigned long dirty = bdi_dirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);
@@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
do_div(inv, denominator);
dirty -= inv;
- if (dirty < *pdirty/2)
- dirty = *pdirty/2;
- *pdirty = dirty;
+ return max(dirty, bdi_dirty/2);
}
/*
@@ -416,25 +397,29 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
- unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long uninitialized_var(available_memory);
struct task_struct *tsk;
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = determine_dirtyable_memory();
+
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else {
- int dirty_ratio;
-
- dirty_ratio = vm_dirty_ratio;
- if (dirty_ratio < 5)
- dirty_ratio = 5;
- dirty = (dirty_ratio * available_memory) / 100;
- }
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
if (dirty_background_bytes)
background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
@@ -450,27 +435,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
}
*pbackground = background;
*pdirty = dirty;
+}
- if (bdi) {
- u64 bdi_dirty;
- long numerator, denominator;
+/*
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ *
+ * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+ u64 bdi_dirty;
+ long numerator, denominator;
- /*
- * Calculate this BDI's share of the dirty ratio.
- */
- bdi_writeout_fraction(bdi, &numerator, &denominator);
-
- bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
- bdi_dirty *= numerator;
- do_div(bdi_dirty, denominator);
- bdi_dirty += (dirty * bdi->min_ratio) / 100;
- if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
- bdi_dirty = dirty * bdi->max_ratio / 100;
-
- *pbdi_dirty = bdi_dirty;
- clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
- task_dirty_limit(current, pbdi_dirty);
- }
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
+
+ return bdi_dirty;
}
/*
@@ -490,58 +485,34 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
-
+ bool dirty_exceeded = false;
struct backing_dev_info *bdi = mapping->backing_dev_info;
for (;;) {
struct writeback_control wbc = {
- .bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
nr_writeback = global_page_state(NR_WRITEBACK);
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
-
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
- break;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_reclaimable + nr_writeback <
+ if (nr_reclaimable + nr_writeback <=
(background_thresh + dirty_thresh) / 2)
break;
- if (!bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
- */
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wbc(&wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
- }
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -556,17 +527,46 @@ static void balance_dirty_pages(struct address_space *mapping,
if (bdi_thresh < 2*bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else if (bdi_nr_reclaimable) {
+ } else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
}
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ /*
+ * The bdi thresh is somehow "soft" limit derived from the
+ * global "hard" limit. The former helps to prevent heavy IO
+ * bdi or process from holding back light ones; The latter is
+ * the last resort safeguard.
+ */
+ dirty_exceeded =
+ (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
+ || (nr_reclaimable + nr_writeback > dirty_thresh);
+
+ if (!dirty_exceeded)
break;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
- __set_current_state(TASK_INTERRUPTIBLE);
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
+
+ /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+ * Unstable writes are a feature of certain networked
+ * filesystems (i.e. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ * Only move pages to writeback if this bdi is over its
+ * threshold otherwise wait until the disk writes catch
+ * up.
+ */
+ trace_wbc_balance_dirty_start(&wbc, bdi);
+ if (bdi_nr_reclaimable > bdi_thresh) {
+ writeback_inodes_wb(&bdi->wb, &wbc);
+ pages_written += write_chunk - wbc.nr_to_write;
+ trace_wbc_balance_dirty_written(&wbc, bdi);
+ if (pages_written >= write_chunk)
+ break; /* We've done our duty */
+ }
+ trace_wbc_balance_dirty_wait(&wbc, bdi);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
/*
@@ -578,8 +578,7 @@ static void balance_dirty_pages(struct address_space *mapping,
pause = HZ / 10;
}
- if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
- bdi->dirty_exceeded)
+ if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -594,10 +593,8 @@ static void balance_dirty_pages(struct address_space *mapping,
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
- bdi_start_writeback(bdi, NULL, 0);
+ (!laptop_mode && (nr_reclaimable > background_thresh)))
+ bdi_start_background_writeback(bdi);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -660,7 +657,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -683,10 +680,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
}
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
@@ -694,24 +687,23 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
+ bdi_arm_supers_timer();
return 0;
}
-static void do_laptop_sync(struct work_struct *work)
-{
- wakeup_flusher_threads(0);
- kfree(work);
-}
-
-static void laptop_timer_fn(unsigned long unused)
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(unsigned long data)
{
- struct work_struct *work;
+ struct request_queue *q = (struct request_queue *)data;
+ int nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- INIT_WORK(work, do_laptop_sync);
- schedule_work(work);
- }
+ /*
+ * We want to write everything out, not just down to the dirty
+ * threshold
+ */
+ if (bdi_has_dirty_io(&q->backing_dev_info))
+ bdi_start_writeback(&q->backing_dev_info, nr_pages);
}
/*
@@ -719,9 +711,9 @@ static void laptop_timer_fn(unsigned long unused)
* of all dirty data a few seconds from now. If the flush is already scheduled
* then push it back - the user is still using the disk.
*/
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+ mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
@@ -731,8 +723,16 @@ void laptop_io_completion(void)
*/
void laptop_sync_completion(void)
{
- del_timer(&laptop_mode_wb_timer);
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+ del_timer(&bdi->laptop_mode_wb_timer);
+
+ rcu_read_unlock();
}
+#endif
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
@@ -803,6 +803,42 @@ void __init page_writeback_init(void)
}
/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback. This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+#define WRITEBACK_TAG_BATCH 4096
+ unsigned long tagged;
+
+ do {
+ spin_lock_irq(&mapping->tree_lock);
+ tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+ &start, end, WRITEBACK_TAG_BATCH,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+ WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+ cond_resched();
+ /* We check 'start' to handle wrapping when end == ~0UL */
+ } while (tagged >= WRITEBACK_TAG_BATCH && start);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -816,6 +852,13 @@ void __init page_writeback_init(void)
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -831,7 +874,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- long nr_to_write = wbc->nr_to_write;
+ int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -849,13 +892,18 @@ int write_cache_pages(struct address_space *mapping,
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
@@ -913,6 +961,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
+ trace_wbc_writepage(wbc, mapping->backing_dev_info);
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -931,25 +980,18 @@ continue_unlock:
done = 1;
break;
}
- }
+ }
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
- done = 1;
- break;
- }
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
}
}
pagevec_release(&pvec);
@@ -966,11 +1008,8 @@ continue_unlock:
end = writeback_index - 1;
goto retry;
}
- if (!wbc->no_nrwrite_index_update) {
- if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
- mapping->writeback_index = done_index;
- wbc->nr_to_write = nr_to_write;
- }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
return ret;
}
@@ -1067,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
int __set_page_dirty_no_writeback(struct page *page)
{
if (!PageDirty(page))
- SetPageDirty(page);
+ return !TestSetPageDirty(page);
return 0;
}
@@ -1079,11 +1118,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
{
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_state(page, NR_DIRTIED);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
task_dirty_inc(current);
task_io_account_write(PAGE_CACHE_SIZE);
}
}
+EXPORT_SYMBOL(account_page_dirtied);
+
+/*
+ * Helper function for set_page_writeback family.
+ * NOTE: Unlike account_page_dirtied this does not rely on being atomic
+ * wrt interrupts.
+ */
+void account_page_writeback(struct page *page)
+{
+ inc_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+}
+EXPORT_SYMBOL(account_page_writeback);
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
@@ -1315,12 +1368,15 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
if (!ret)
- inc_zone_page_state(page, NR_WRITEBACK);
+ account_page_writeback(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0fd5b1..90c1439549fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
@@ -49,12 +50,30 @@
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/memory.h>
+#include <linux/compaction.h>
#include <trace/events/kmem.h>
+#include <linux/ftrace_event.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
/*
* Array of node states.
*/
@@ -76,6 +95,36 @@ unsigned long totalreserve_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
+}
+
+void pm_restrict_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~GFP_IOFS;
+}
+#endif /* CONFIG_PM_SLEEP */
+
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif
@@ -263,10 +312,7 @@ static void bad_page(struct page *page)
printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- printk(KERN_ALERT
- "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
- page, (void *)page->flags, page_count(page),
- page_mapcount(page), page->mapping, page->index);
+ dump_page(page);
dump_stack();
out:
@@ -311,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
+/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
@@ -380,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
- unsigned long buddy_idx = page_idx ^ (1 << order);
-
- return page + (buddy_idx - page_idx);
-}
-
static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
{
- return (page_idx & ~(1 << order));
+ return page_idx ^ (1 << order);
}
/*
@@ -402,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -436,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
@@ -452,6 +491,9 @@ static inline void __free_one_page(struct page *page,
int migratetype)
{
unsigned long page_idx;
+ unsigned long combined_idx;
+ unsigned long uninitialized_var(buddy_idx);
+ struct page *buddy;
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
@@ -465,10 +507,8 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
- unsigned long combined_idx;
- struct page *buddy;
-
- buddy = __page_find_buddy(page, page_idx, order);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
@@ -476,14 +516,36 @@ static inline void __free_one_page(struct page *page,
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
- combined_idx = __find_combined_index(page_idx, order);
+ combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
}
set_page_order(page, order);
- list_add(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+
+ /*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In case,
+ * that is happening, add the free page to the tail of the list
+ * so it's less likely to be used soon and more likely to be merged
+ * as a higher order page
+ */
+ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+ struct page *higher_page, *higher_buddy;
+ combined_idx = buddy_idx & page_idx;
+ higher_page = page + (combined_idx - page_idx);
+ buddy_idx = __find_buddy_index(combined_idx, order + 1);
+ higher_buddy = page + (buddy_idx - combined_idx);
+ if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+ list_add_tail(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
+ goto out;
+ }
+ }
+
+ list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
zone->free_area[order].nr_free++;
}
@@ -528,13 +590,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
+ int to_free = count;
spin_lock(&zone->lock);
- zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+ zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, count);
- while (count) {
+ while (to_free) {
struct page *page;
struct list_head *list;
@@ -559,8 +621,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, page_private(page));
trace_mm_page_pcpu_drain(page, 0, page_private(page));
- } while (--count && --batch_free && !list_empty(list));
+ } while (--to_free && --batch_free && !list_empty(list));
}
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@@ -568,27 +631,28 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
int migratetype)
{
spin_lock(&zone->lock);
- zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+ zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
spin_unlock(&zone->lock);
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
{
- unsigned long flags;
int i;
int bad = 0;
- int wasMlocked = __TestClearPageMlocked(page);
+ trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
- for (i = 0 ; i < (1 << order) ; ++i)
+ if (PageAnon(page))
+ page->mapping = NULL;
+ for (i = 0; i < (1 << order); i++)
bad += free_pages_check(page + i);
if (bad)
- return;
+ return false;
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -598,6 +662,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
+ return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ int wasMlocked = __TestClearPageMlocked(page);
+
+ if (!free_pages_prepare(page, order))
+ return;
+
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
@@ -1009,10 +1084,10 @@ static void drain_pages(unsigned int cpu)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ local_irq_save(flags);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- local_irq_save(flags);
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
local_irq_restore(flags);
@@ -1073,8 +1148,9 @@ void mark_free_pages(struct zone *zone)
/*
* Free a 0-order page
+ * cold == 1 ? free a cold page : free a hot page
*/
-static void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -1082,21 +1158,9 @@ static void free_hot_cold_page(struct page *page, int cold)
int migratetype;
int wasMlocked = __TestClearPageMlocked(page);
- kmemcheck_free_shadow(page, 0);
-
- if (PageAnon(page))
- page->mapping = NULL;
- if (free_pages_check(page))
+ if (!free_pages_prepare(page, 0))
return;
- if (!PageHighMem(page)) {
- debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
- debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
- }
- arch_free_page(page, 0);
- kernel_map_pages(page, 1, 0);
-
- pcp = &zone_pcp(zone, get_cpu())->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
@@ -1119,6 +1183,7 @@ static void free_hot_cold_page(struct page *page, int cold)
migratetype = MIGRATE_MOVABLE;
}
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
@@ -1131,15 +1196,8 @@ static void free_hot_cold_page(struct page *page, int cold)
out:
local_irq_restore(flags);
- put_cpu();
}
-void free_hot_page(struct page *page)
-{
- trace_mm_page_free_direct(page, 0);
- free_hot_cold_page(page, 0);
-}
-
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n-1]
@@ -1169,6 +1227,51 @@ void split_page(struct page *page, unsigned int order)
}
/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_pages() after interrupts
+ * are enabled.
+ *
+ * Returns the number of pages the free page was split into (1 << order),
+ * or 0 if the watermark check fails, in which case the page is left
+ * untouched on the free list.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+ unsigned int order;
+ unsigned long watermark;
+ struct zone *zone;
+
+ BUG_ON(!PageBuddy(page)); /* only a page currently marked PageBuddy may be passed in */
+
+ zone = page_zone(page);
+ order = page_order(page);
+
+ /* Obey watermarks as if the page was being allocated */
+ watermark = low_wmark_pages(zone) + (1 << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return 0;
+
+ /* Remove page from free list */
+ list_del(&page->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(page);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); /* keep NR_FREE_PAGES in step with the removal */
+
+ /* Split into individual pages */
+ set_page_refcounted(page);
+ split_page(page, order);
+
+ if (order >= pageblock_order - 1) { /* large enough: retag every pageblock stepped over as MIGRATE_MOVABLE */
+ struct page *endpage = page + (1 << order) - 1;
+ for (; page < endpage; page += pageblock_nr_pages)
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ }
+
+ return 1 << order;
+}
+
+/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
@@ -1181,17 +1284,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
- int cpu;
again:
- cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
- pcp = &zone_pcp(zone, cpu)->pcp;
- list = &pcp->lists[migratetype];
local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
@@ -1232,7 +1333,6 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
- put_cpu();
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1241,7 +1341,6 @@ again:
failed:
local_irq_restore(flags);
- put_cpu();
return NULL;
}
@@ -1354,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
+ *
+ * 'free_pages' is the caller's reading of the zone's free page count; it is
+ * first reduced by (1 << order) - 1 and then compared against the (possibly
+ * ALLOC_HIGH/ALLOC_HARDER reduced) mark plus z->lowmem_reserve[classzone_idx].
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages may go negative - that's OK */
long min = mark;
- long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) - 1; /* same adjustment as the old "- (1 << order) + 1" */
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1380,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	/* Watermark check against the counter returned by zone_page_state() */
+	long nr_free = zone_page_state(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx,
+				   alloc_flags, nr_free);
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+			    int classzone_idx, int alloc_flags)
+{
+	long nr_free = zone_page_state(z, NR_FREE_PAGES);
+
+	/*
+	 * Below a non-zero percpu_drift_mark, replace the reading with the
+	 * value from zone_page_state_snapshot() before checking the mark.
+	 */
+	if (z->percpu_drift_mark != 0 && nr_free < z->percpu_drift_mark)
+		nr_free = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx,
+				   alloc_flags, nr_free);
}
#ifdef CONFIG_NUMA
@@ -1639,7 +1757,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct page *page;
/* Acquire the OOM killer lock for the zones in zonelist */
- if (!try_set_zone_oom(zonelist, gfp_mask)) {
+ if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
schedule_timeout_uninterruptible(1);
return NULL;
}
@@ -1660,6 +1778,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -1678,6 +1799,66 @@ out:
return page;
}
+#ifdef CONFIG_COMPACTION
+/*
+ * Try memory compaction for high-order allocations before reclaim.
+ *
+ * Returns the allocated page, or NULL when order is 0, compaction is
+ * deferred for preferred_zone, try_to_compact_pages() reported
+ * COMPACT_SKIPPED, or the allocation still failed after compaction ran.
+ * The result of try_to_compact_pages() is stored in *did_some_progress.
+ */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
+{
+	struct page *page;
+	int cpu;
+
+	if (order == 0)
+		return NULL;
+	if (compaction_deferred(preferred_zone))
+		return NULL;
+
+	/* PF_MEMALLOC is set on the current task only while compacting */
+	current->flags |= PF_MEMALLOC;
+	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+						nodemask, sync_migration);
+	current->flags &= ~PF_MEMALLOC;
+
+	if (*did_some_progress == COMPACT_SKIPPED)
+		return NULL;
+
+	/* Page migration frees to the PCP lists but we want merging */
+	cpu = get_cpu();
+	drain_pages(cpu);
+	put_cpu();
+
+	page = get_page_from_freelist(gfp_mask, nodemask,
+				      order, zonelist, high_zoneidx,
+				      alloc_flags, preferred_zone,
+				      migratetype);
+	if (!page) {
+		/*
+		 * It's bad if compaction run occurs and fails.
+		 * The most likely reason is that pages exist,
+		 * but not enough to satisfy watermarks.
+		 */
+		count_vm_event(COMPACTFAIL);
+		defer_compaction(preferred_zone);
+
+		cond_resched();
+		return NULL;
+	}
+
+	/* Success: reset the zone's compaction deferral bookkeeping */
+	preferred_zone->compact_considered = 0;
+	preferred_zone->compact_defer_shift = 0;
+	count_vm_event(COMPACTSUCCESS);
+	return page;
+}
+#else
+/* Stub used when CONFIG_COMPACTION is not defined: never yields a page */
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
+{
+	return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1687,33 +1868,44 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
- struct task_struct *p = current;
+ bool drained = false;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- p->flags |= PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ current->reclaim_state = &reclaim_state;
*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
- p->reclaim_state = NULL;
+ current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ current->flags &= ~PF_MEMALLOC;
cond_resched();
- if (order != 0)
- drain_all_pages();
+ if (unlikely(!(*did_some_progress)))
+ return NULL;
- if (likely(*did_some_progress))
- page = get_page_from_freelist(gfp_mask, nodemask, order,
+retry:
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags, preferred_zone,
migratetype);
+
+ /*
+ * If an allocation failed after direct reclaim, it could be because
+ * pages are pinned on the per-cpu lists. Drain them and try again
+ */
+ if (!page && !drained) {
+ drain_all_pages();
+ drained = true;
+ goto retry;
+ }
+
return page;
}
@@ -1735,7 +1927,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
@@ -1743,24 +1935,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
- enum zone_type high_zoneidx)
+ enum zone_type high_zoneidx,
+ enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order);
+ wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- struct task_struct *p = current;
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const gfp_t wait = gfp_mask & __GFP_WAIT;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
- BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/*
* The caller may dip into page reserves a bit more if the caller
@@ -1768,21 +1960,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags |= (gfp_mask & __GFP_HIGH);
+ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (!wait) {
- alloc_flags |= ALLOC_HARDER;
+ /*
+ * Not worth trying to allocate harder for
+ * __GFP_NOMEMALLOC even if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)) && !in_interrupt())
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (!in_interrupt() &&
- ((p->flags & PF_MEMALLOC) ||
+ ((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
@@ -1801,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- struct task_struct *p = current;
+ bool sync_migration = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -1826,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
restart:
- wake_all_kswapd(order, zonelist, high_zoneidx);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -1857,13 +2056,27 @@ rebalance:
goto nopage;
/* Avoid recursion of direct reclaim */
- if (p->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
+ /*
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
+ sync_migration = true;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -1888,15 +2101,23 @@ rebalance:
if (page)
goto got_pg;
- /*
- * The OOM killer does not trigger for high-order
- * ~__GFP_NOFAIL allocations so if no progress is being
- * made, there are no other options and retrying is
- * unlikely to help.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ /*
+ * The oom killer is not called for high-order
+ * allocations that may fail, so if no progress
+ * is being made, there are no other options and
+ * retrying is unlikely to help.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto nopage;
+ /*
+ * The oom killer is not called for lowmem
+ * allocations to prevent needlessly killing
+ * innocent tasks.
+ */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto nopage;
+ }
goto restart;
}
@@ -1906,15 +2127,29 @@ rebalance:
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
+ } else {
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
}
nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
+ current->comm, order, gfp_mask);
dump_stack();
show_mem();
}
@@ -1955,10 +2190,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ get_mems_allowed();
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
+ if (!preferred_zone) {
+ put_mems_allowed();
return NULL;
+ }
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1968,6 +2206,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page;
@@ -2013,9 +2252,8 @@ void __pagevec_free(struct pagevec *pvec)
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
- trace_mm_page_free_direct(page, order);
if (order == 0)
- free_hot_page(page);
+ free_hot_cold_page(page, 0);
else
__free_pages_ok(page, order);
}
@@ -2180,7 +2418,7 @@ void show_free_areas(void)
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, cpu);
+ pageset = per_cpu_ptr(zone->pageset, cpu);
printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
cpu, pageset->pcp.high,
@@ -2271,7 +2509,7 @@ void show_free_areas(void)
K(zone_page_state(zone, NR_BOUNCE)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
- (zone_is_all_unreclaimable(zone) ? "yes" : "no")
+ (zone->all_unreclaimable ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2388,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- if (s)
- return __parse_numa_zonelist_order(s);
- return 0;
+ int ret;
+
+ if (!s)
+ return 0;
+
+ ret = __parse_numa_zonelist_order(s);
+ if (ret == 0)
+ strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+ return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
@@ -2420,8 +2665,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
strncpy((char*)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order)
- build_all_zonelists();
+ } else if (oldval != user_zonelist_order) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
}
out:
mutex_unlock(&zl_order_mutex);
@@ -2565,10 +2813,10 @@ static int default_zonelist_order(void)
struct zone *z;
int average_size;
/*
- * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+ * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
* If they are really small and used heavily, the system can fall
* into OOM very easily.
- * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+ * This function detect ZONE_DMA/DMA32 size and configures zone order.
*/
/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
low_kmem_size = 0;
@@ -2580,6 +2828,15 @@ static int default_zonelist_order(void)
if (zone_type < ZONE_NORMAL)
low_kmem_size += z->present_pages;
total_size += z->present_pages;
+ } else if (zone_type == ZONE_NORMAL) {
+ /*
+ * If any node has only lowmem, then node order
+ * is preferred to allow kernel allocations
+ * locally; otherwise, they can easily infringe
+ * on other nodes when there is an abundance of
+ * lowmem available to allocate from.
+ */
+ return ZONELIST_ORDER_NODE;
}
}
}
@@ -2693,6 +2950,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations, i.e. the node of
+ * the first zone in the given node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so the GFP_KERNEL zonelist is the one consulted.
+ */
+int local_memory_node(int node)
+{
+	struct zonelist *zl = node_zonelist(node, GFP_KERNEL);
+	struct zone *first;
+
+	/* Only the zone written through &first is of interest here */
+	(void)first_zones_zonelist(zl, gfp_zone(GFP_KERNEL), NULL, &first);
+
+	return first->node;
+}
+#endif
#else /* CONFIG_NUMA */
@@ -2745,10 +3020,36 @@ static void build_zonelist_cache(pg_data_t *pgdat)
#endif /* CONFIG_NUMA */
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
{
int nid;
+ int cpu;
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -2759,10 +3060,45 @@ static int __build_all_zonelists(void *dummy)
build_zonelists(pgdat);
build_zonelist_cache(pgdat);
}
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu) {
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+ /*
+ * We now know the "local memory node" for each node--
+ * i.e., the node of the first zone in the generic zonelist.
+ * Set up numa_mem percpu variable for on-line cpus. During
+ * boot, only the boot cpu should be on-line; we'll init the
+ * secondary cpus' numa_mem as they come on-line. During
+ * node/memory hotplug, we'll fixup all on-line cpus.
+ */
+ if (cpu_online(cpu))
+ set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+ }
+
return 0;
}
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
{
set_zonelist_order();
@@ -2773,6 +3109,10 @@ void build_all_zonelists(void)
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
+#ifdef CONFIG_MEMORY_HOTPLUG
+ if (data)
+ setup_zone_pageset((struct zone *)data);
+#endif
stop_machine(__build_all_zonelists, NULL, NULL);
/* cpuset refresh routine should be here */
}
@@ -3096,121 +3436,36 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
+static __meminit void setup_zone_pageset(struct zone *zone)
{
- struct zone *zone, *dzone;
- int node = cpu_to_node(cpu);
+ int cpu;
- node_set_state(node, N_CPU); /* this node has a cpu */
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
- for_each_populated_zone(zone) {
- zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, node);
- if (!zone_pcp(zone, cpu))
- goto bad;
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
- setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+ setup_pageset(pcp, zone_batchsize(zone));
if (percpu_pagelist_fraction)
- setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
- }
-
- return 0;
-bad:
- for_each_zone(dzone) {
- if (!populated_zone(dzone))
- continue;
- if (dzone == zone)
- break;
- kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = &boot_pageset[cpu];
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
}
- return -ENOMEM;
}
-static inline void free_zone_pagesets(int cpu)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
- /* Free per_cpu_pageset if it is slab allocated */
- if (pset != &boot_pageset[cpu])
- kfree(pset);
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-}
-
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- int cpu = (long)hcpu;
- int ret = NOTIFY_OK;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- free_zone_pagesets(cpu);
- break;
- default:
- break;
- }
- return ret;
-}
-
-static struct notifier_block __cpuinitdata pageset_notifier =
- { &pageset_cpuup_callback, NULL, 0 };
-
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ */
void __init setup_per_cpu_pageset(void)
{
- int err;
+ struct zone *zone;
- /* Initialize per_cpu_pageset for cpu 0.
- * A cpuup callback will do this for every cpu
- * as it comes online
- */
- err = process_zones(smp_processor_id());
- BUG_ON(err);
- register_cpu_notifier(&pageset_notifier);
+ for_each_populated_zone(zone)
+ setup_zone_pageset(zone);
}
-#endif
-
static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
@@ -3260,11 +3515,11 @@ static int __zone_pcp_update(void *data)
int cpu;
unsigned long batch = zone_batchsize(zone), flags;
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ for_each_possible_cpu(cpu) {
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
local_irq_save(flags);
@@ -3282,21 +3537,17 @@ void zone_pcp_update(struct zone *zone)
static __meminit void zone_pcp_init(struct zone *zone)
{
- int cpu;
- unsigned long batch = zone_batchsize(zone);
+ /*
+ * per cpu subsystem is not up at this point. The following code
+ * relies on the ability of the linker to provide the
+ * offset of a (static) per cpu variable into the per cpu area.
+ */
+ zone->pageset = &boot_pageset;
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
- /* Early boot. Slab allocator not functional yet */
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
-#else
- setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
- }
if (zone->present_pages)
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
+ zone->name, zone->present_pages,
+ zone_batchsize(zone));
}
__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3435,6 +3686,84 @@ void __init free_bootmem_with_active_regions(int nid,
}
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Walk the early_node_map entries of node @nid and return the first address
+ * memblock_find_in_range() yields for @size/@align inside an entry clipped
+ * to [@goal, @limit). Returns MEMBLOCK_ERROR if no entry produces one.
+ */
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		u64 range_start, range_end;
+		u64 lo, hi;
+		u64 addr;
+
+		/* Convert the entry's pfn bounds to byte addresses */
+		range_start = early_node_map[i].start_pfn;
+		range_start <<= PAGE_SHIFT;
+		range_end = early_node_map[i].end_pfn;
+		range_end <<= PAGE_SHIFT;
+
+		/* Clip the entry to the window requested by the caller */
+		lo = max(range_start, goal);
+		hi = min(range_end, limit);
+		if (lo >= hi)
+			continue;
+
+		addr = memblock_find_in_range(lo, hi, size, align);
+		if (addr != MEMBLOCK_ERROR)
+			return addr;
+	}
+
+	return MEMBLOCK_ERROR;
+}
+#endif
+
+/*
+ * Feed every early_node_map entry of node @nid (as a start/end pfn pair)
+ * to add_range() and return the range count it reports after the last one.
+ */
+int __init add_from_early_node_map(struct range *range, int az,
+				   int nr_range, int nid)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		u64 first_pfn = early_node_map[i].start_pfn;
+		u64 last_pfn = early_node_map[i].end_pfn;
+
+		nr_range = add_range(range, az, nr_range, first_pfn, last_pfn);
+	}
+
+	return nr_range;
+}
+
+#ifdef CONFIG_NO_BOOTMEM
+void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{ /* Returns a zeroed virtual address for the range found, or NULL on MEMBLOCK_ERROR */
+ void *ptr;
+ u64 addr;
+
+ if (limit > memblock.current_limit) /* never search beyond memblock.current_limit */
+ limit = memblock.current_limit;
+
+ addr = find_memory_core_early(nid, size, align, goal, limit);
+
+ if (addr == MEMBLOCK_ERROR) /* nothing suitable in this node's ranges */
+ return NULL;
+
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size); /* the block is handed back zero-filled */
+ memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); /* presumably marks [addr, addr + size) reserved - confirm against memblock */
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+ return ptr;
+}
+#endif
+
+
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
int i;
@@ -3733,7 +4062,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
}
#else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
struct zone *zone, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
@@ -3849,8 +4178,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
- zone->prev_priority = DEF_PRIORITY;
-
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
@@ -4377,8 +4704,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
- printk(" %-8s %0#10lx -> %0#10lx\n",
- zone_names[i],
+ printk(" %-8s ", zone_names[i]);
+ if (arch_zone_lowest_possible_pfn[i] ==
+ arch_zone_highest_possible_pfn[i])
+ printk("empty\n");
+ else
+ printk("%0#10lx -> %0#10lx\n",
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
}
@@ -4467,7 +4798,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
+struct pglist_data __refdata contig_page_data = {
+#ifndef CONFIG_NO_BOOTMEM
+ .bdata = &bootmem_node_data[0]
+#endif
+ };
EXPORT_SYMBOL(contig_page_data);
#endif
@@ -4810,10 +5145,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
if (!write || (ret == -EINVAL))
return ret;
for_each_populated_zone(zone) {
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned long high;
high = zone->present_pages / percpu_pagelist_fraction;
- setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ setup_pagelist_highmark(
+ per_cpu_ptr(zone->pageset, cpu), high);
}
}
return 0;
@@ -4911,9 +5247,9 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+ printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
tablename,
- (1U << log2qty),
+ (1UL << log2qty),
ilog2(size) - PAGE_SHIFT,
size);
@@ -5010,12 +5346,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
* page allocater never alloc memory from ISOLATE block.
*/
+static int
+__count_immobile_pages(struct zone *zone, struct page *page, int count)
+{
+ unsigned long pfn, iter, found;
+ /*
+ * Scan the pageblock_nr_pages pfns starting at @page and return true
+ * unless more than @count in-use pages that are not on an LRU list
+ * are found. To avoid noisy data, lru_add_drain_all() should be called.
+ * If ZONE_MOVABLE, the zone never contains immobile pages.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ return true;
+
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+ return true;
+
+ pfn = page_to_pfn(page);
+ for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ unsigned long check = pfn + iter;
+
+ if (!pfn_valid_within(check)) {
+ /* the loop header advances iter; don't skip the next pfn */
+ continue;
+ }
+ page = pfn_to_page(check);
+ if (!page_count(page)) {
+ if (PageBuddy(page))
+ iter += (1 << page_order(page)) - 1;
+ continue;
+ }
+ if (!PageLRU(page))
+ found++;
+ /*
+ * If there are RECLAIMABLE pages, we need to check them.
+ * But for now, memory offline itself doesn't call shrink_slab()
+ * and this is still to be fixed.
+ *
+ * If the page is not RAM, page_count() should be 0, so no further
+ * check is needed. This is a _used_ not-movable page.
+ *
+ * The problematic case is PG_reserved pages: PG_reserved is set
+ * on both memory hole pages and _used_ kernel pages at boot.
+ */
+ if (found > count)
+ return false;
+ }
+ return true;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ return __count_immobile_pages(zone, page, 0);
+}
+
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- struct page *curr_page;
- unsigned long flags, pfn, iter;
- unsigned long immobile = 0;
+ unsigned long flags, pfn;
struct memory_isolate_notify arg;
int notifier_ret;
int ret = -EBUSY;
@@ -5025,11 +5414,6 @@ int set_migratetype_isolate(struct page *page)
zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
- zone_idx == ZONE_MOVABLE) {
- ret = 0;
- goto out;
- }
pfn = page_to_pfn(page);
arg.start_pfn = pfn;
@@ -5049,23 +5433,20 @@ int set_migratetype_isolate(struct page *page)
*/
notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret || !arg.pages_found)
+ if (notifier_ret)
goto out;
-
- for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
- if (!pfn_valid_within(pfn))
- continue;
-
- curr_page = pfn_to_page(iter);
- if (!page_count(curr_page) || PageLRU(curr_page))
- continue;
-
- immobile++;
- }
-
- if (arg.pages_found == immobile)
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ */
+ if (__count_immobile_pages(zone, page, arg.pages_found))
ret = 0;
+ /*
+ * immobile means "not-on-lru" pages. If immobile is larger than
+ * removable-by-driver pages reported by notifier, we'll fail.
+ */
+
out:
if (!ret) {
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
@@ -5159,3 +5540,79 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
#endif
+
+static struct trace_print_flags pageflag_names[] = {
+ {1UL << PG_locked, "locked" },
+ {1UL << PG_error, "error" },
+ {1UL << PG_referenced, "referenced" },
+ {1UL << PG_uptodate, "uptodate" },
+ {1UL << PG_dirty, "dirty" },
+ {1UL << PG_lru, "lru" },
+ {1UL << PG_active, "active" },
+ {1UL << PG_slab, "slab" },
+ {1UL << PG_owner_priv_1, "owner_priv_1" },
+ {1UL << PG_arch_1, "arch_1" },
+ {1UL << PG_reserved, "reserved" },
+ {1UL << PG_private, "private" },
+ {1UL << PG_private_2, "private_2" },
+ {1UL << PG_writeback, "writeback" },
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+ {1UL << PG_head, "head" },
+ {1UL << PG_tail, "tail" },
+#else
+ {1UL << PG_compound, "compound" },
+#endif
+ {1UL << PG_swapcache, "swapcache" },
+ {1UL << PG_mappedtodisk, "mappedtodisk" },
+ {1UL << PG_reclaim, "reclaim" },
+ {1UL << PG_swapbacked, "swapbacked" },
+ {1UL << PG_unevictable, "unevictable" },
+#ifdef CONFIG_MMU
+ {1UL << PG_mlocked, "mlocked" },
+#endif
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+ {1UL << PG_uncached, "uncached" },
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ {1UL << PG_hwpoison, "hwpoison" },
+#endif
+ {-1UL, NULL },
+};
+
+static void dump_page_flags(unsigned long flags)
+{
+ const char *delim = "";
+ unsigned long mask;
+ int i;
+
+ printk(KERN_ALERT "page flags: %#lx(", flags);
+
+ /* remove zone id */
+ flags &= (1UL << NR_PAGEFLAGS) - 1;
+
+ for (i = 0; pageflag_names[i].name && flags; i++) {
+
+ mask = pageflag_names[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ flags &= ~mask;
+ printk("%s%s", delim, pageflag_names[i].name);
+ delim = "|";
+ }
+
+ /* check for left over flags */
+ if (flags)
+ printk("%s%#lx", delim, flags);
+
+ printk(")\n");
+}
+
+void dump_page(struct page *page)
+{
+ printk(KERN_ALERT
+ "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ page, atomic_read(&page->_count), page_mapcount(page),
+ page->mapping, page->index);
+ dump_page_flags(page->flags);
+}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d594826..5bffada7cde1 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
+#include <linux/kmemleak.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -126,6 +127,12 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
if (!base)
base = vmalloc(table_size);
}
+ /*
+ * The value stored in section->page_cgroup is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
} else {
/*
* We don't have to allocate page_cgroup again, but
@@ -284,6 +291,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
struct page **map;
unsigned long length;
+ spinlock_t lock;
};
struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
@@ -335,6 +343,43 @@ not_enough_page:
}
/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns old id on success, 0 on failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new)
+{
+ int type = swp_type(ent);
+ unsigned long offset = swp_offset(ent);
+ unsigned long idx = offset / SC_PER_PAGE;
+ unsigned long pos = offset & SC_POS_MASK;
+ struct swap_cgroup_ctrl *ctrl;
+ struct page *mappage;
+ struct swap_cgroup *sc;
+ unsigned long flags;
+ unsigned short retval;
+
+ ctrl = &swap_cgroup_ctrl[type];
+
+ mappage = ctrl->map[idx];
+ sc = page_address(mappage);
+ sc += pos;
+ spin_lock_irqsave(&ctrl->lock, flags);
+ retval = sc->id;
+ if (retval == old)
+ sc->id = new;
+ else
+ retval = 0;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return retval;
+}
+
+/**
* swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into
* @mem: mem_cgroup to be recorded
@@ -352,14 +397,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
struct page *mappage;
struct swap_cgroup *sc;
unsigned short old;
+ unsigned long flags;
ctrl = &swap_cgroup_ctrl[type];
mappage = ctrl->map[idx];
sc = page_address(mappage);
sc += pos;
+ spin_lock_irqsave(&ctrl->lock, flags);
old = sc->id;
sc->id = id;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
return old;
}
@@ -411,6 +459,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
mutex_lock(&swap_cgroup_mutex);
ctrl->length = length;
ctrl->map = array;
+ spin_lock_init(&ctrl->lock);
if (swap_cgroup_prepare(type)) {
/* memory shortage */
ctrl->map = NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index a19af956ee1b..2dee975bf469 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/kernel_stat.h>
+#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
@@ -105,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
goto out;
}
if (wbc->sync_mode == WB_SYNC_ALL)
- rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+ rw |= REQ_SYNC | REQ_UNPLUG;
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
* all pages in [start_pfn...end_pfn) must be in the same zone.
* zone->lock must be held before call this.
*
- * Returns 0 if all pages in the range is isolated.
+ * Returns 1 if all pages in the range are isolated.
*/
static int
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
struct zone *zone;
int ret;
- pfn = start_pfn;
/*
* Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
* is not aligned to pageblock_nr_pages.
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7b47a57b6646..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(walk->mm, pmd);
if (pmd_none_or_clear_bad(pmd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
@@ -80,6 +81,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
return err;
}
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
+ return boundary < end ? boundary : end;
+}
+
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hstate *h = hstate_vma(vma);
+ unsigned long next;
+ unsigned long hmask = huge_page_mask(h);
+ pte_t *pte;
+ int err = 0;
+
+ do {
+ next = hugetlb_entry_end(h, addr, end);
+ pte = huge_pte_offset(walk->mm, addr & hmask);
+ if (pte && walk->hugetlb_entry)
+ err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+ if (err)
+ return err;
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+#endif
+
/**
* walk_page_range - walk a memory map's page tables with a callback
* @mm: memory map to walk
@@ -108,7 +140,6 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd_t *pgd;
unsigned long next;
int err = 0;
- struct vm_area_struct *vma;
if (addr >= end)
return err;
@@ -118,30 +149,28 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, addr);
do {
+ struct vm_area_struct *uninitialized_var(vma);
+
next = pgd_addr_end(addr, end);
+#ifdef CONFIG_HUGETLB_PAGE
/*
* handle hugetlb vma individually because pagetable walk for
* the hugetlb page is dependent on the architecture and
* we can't handled it in the same manner as non-huge pages.
*/
vma = find_vma(walk->mm, addr);
-#ifdef CONFIG_HUGETLB_PAGE
if (vma && is_vm_hugetlb_page(vma)) {
- pte_t *pte;
- struct hstate *hs;
-
if (vma->vm_end < next)
next = vma->vm_end;
- hs = hstate_vma(vma);
- pte = huge_pte_offset(walk->mm,
- addr & huge_page_mask(hs));
- if (pte && !huge_pte_none(huge_ptep_get(pte))
- && walk->hugetlb_entry)
- err = walk->hugetlb_entry(pte, addr,
- next, walk);
+ /*
+ * Hugepage is very tightly coupled with vma, so
+ * walk through hugetlb entries within a given vma.
+ */
+ err = walk_hugetlb_range(vma, addr, next, walk);
if (err)
break;
+ pgd = pgd_offset(walk->mm, next);
continue;
}
#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 000000000000..89633fefc6a2
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,108 @@
+/*
+ * mm/percpu-km.c - kernel memory based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are allocated as a contiguous kernel memory using gfp
+ * allocation. This is to be used on nommu architectures.
+ *
+ * To use percpu-km,
+ *
+ * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
+ *
+ * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's
+ * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work
+ * fine.
+ *
+ * - NUMA is not supported. When setting up the first chunk,
+ * @cpu_distance_fn should be NULL or report all CPUs to be nearer
+ * than or at LOCAL_DISTANCE.
+ *
+ * - It's best if the chunk size is a power-of-two multiple of
+ * PAGE_SIZE. Because each chunk is allocated as a contiguous
+ * kernel memory block using alloc_pages(), memory will be wasted if
+ * chunk size is not aligned. percpu-km code will whine about it.
+ */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#error "contiguous percpu allocation is incompatible with paged first chunk"
+#endif
+
+#include <linux/log2.h>
+
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+
+ return 0;
+}
+
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ /* nada */
+}
+
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+ const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+ struct pcpu_chunk *chunk;
+ struct page *pages;
+ int i;
+
+ chunk = pcpu_alloc_chunk();
+ if (!chunk)
+ return NULL;
+
+ pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+ if (!pages) {
+ pcpu_free_chunk(chunk);
+ return NULL;
+ }
+
+ for (i = 0; i < nr_pages; i++)
+ pcpu_set_page_chunk(nth_page(pages, i), chunk);
+
+ chunk->data = pages;
+ chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
+ return chunk;
+}
+
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+ const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+
+ if (chunk && chunk->data)
+ __free_pages(chunk->data, order_base_2(nr_pages));
+ pcpu_free_chunk(chunk);
+}
+
+static struct page *pcpu_addr_to_page(void *addr)
+{
+ return virt_to_page(addr);
+}
+
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+ size_t nr_pages, alloc_pages;
+
+ /* all units must be in a single group */
+ if (ai->nr_groups != 1) {
+ printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+ return -EINVAL;
+ }
+
+ nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
+ alloc_pages = roundup_pow_of_two(nr_pages);
+
+ if (alloc_pages > nr_pages)
+ printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
+ alloc_pages - nr_pages);
+
+ return 0;
+}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 000000000000..ea534960a04b
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,451 @@
+/*
+ * mm/percpu-vm.c - vmalloc area based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are mapped into vmalloc areas and populated page by page.
+ * This is the default chunk allocator.
+ */
+
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ /* must not be used on pre-mapped chunk */
+ WARN_ON(chunk->immutable);
+
+ return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
+}
+
+/**
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
+ * @chunk: chunk of interest
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
+ *
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx(). The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated. Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
+ */
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+ unsigned long **bitmapp,
+ bool may_alloc)
+{
+ static struct page **pages;
+ static unsigned long *bitmap;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+ size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+ sizeof(unsigned long);
+
+ if (!pages || !bitmap) {
+ if (may_alloc && !pages)
+ pages = pcpu_mem_alloc(pages_size);
+ if (may_alloc && !bitmap)
+ bitmap = pcpu_mem_alloc(bitmap_size);
+ if (!pages || !bitmap)
+ return NULL;
+ }
+
+ memset(pages, 0, pages_size);
+ bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
+
+ *bitmapp = bitmap;
+ return pages;
+}
+
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start,@page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+ if (page)
+ __free_page(page);
+ }
+ }
+}
+
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ if (!*pagep) {
+ pcpu_free_pages(chunk, pages, populated,
+ page_start, page_end);
+ return -ENOMEM;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+ unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
+
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished. The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page;
+
+ page = pcpu_chunk_page(chunk, cpu, i);
+ WARN_ON(!page);
+ pages[pcpu_page_idx(cpu, i)] = page;
+ }
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ page_end - page_start);
+ }
+
+ for (i = page_start; i < page_end; i++)
+ __clear_bit(i, populated);
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+ int nr_pages)
+{
+ return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
+ PAGE_KERNEL, pages);
+}
+
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
+ */
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu, tcpu;
+ int i, err;
+
+ for_each_possible_cpu(cpu) {
+ err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ &pages[pcpu_page_idx(cpu, page_start)],
+ page_end - page_start);
+ if (err < 0)
+ goto err;
+ }
+
+ /* mapping successful, link chunk and mark populated */
+ for (i = page_start; i < page_end; i++) {
+ for_each_possible_cpu(cpu)
+ pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+ chunk);
+ __set_bit(i, populated);
+ }
+
+ return 0;
+
+err:
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+ page_end - page_start);
+ }
+ return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), cache flushing is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate in bytes
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ int free_end = page_start, unmap_end = page_start;
+ struct page **pages;
+ unsigned long *populated;
+ unsigned int cpu;
+ int rs, re, rc;
+
+ /* quick path, check whether all pages are already there */
+ rs = page_start;
+ pcpu_next_pop(chunk, &rs, &re, page_end);
+ if (rs == page_start && re == page_end)
+ goto clear;
+
+ /* need to allocate and map pages, this chunk can't be immutable */
+ WARN_ON(chunk->immutable);
+
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ if (!pages)
+ return -ENOMEM;
+
+ /* alloc and map */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_free;
+ free_end = re;
+ }
+
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_unmap;
+ unmap_end = re;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+ return 0;
+
+err_unmap:
+ pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+ return rc;
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk. The cache is flushed before unmapping; the TLB is not
+ * flushed here because vmalloc will handle that lazily.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ struct page **pages;
+ unsigned long *populated;
+ int rs, re;
+
+ /* quick path, check whether it's empty already */
+ rs = page_start;
+ pcpu_next_unpop(chunk, &rs, &re, page_end);
+ if (rs == page_start && re == page_end)
+ return;
+
+ /* immutable chunks can't be depopulated */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+}
+
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+ struct pcpu_chunk *chunk;
+ struct vm_struct **vms;
+
+ chunk = pcpu_alloc_chunk();
+ if (!chunk)
+ return NULL;
+
+ vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
+ pcpu_nr_groups, pcpu_atom_size);
+ if (!vms) {
+ pcpu_free_chunk(chunk);
+ return NULL;
+ }
+
+ chunk->data = vms;
+ chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+ return chunk;
+}
+
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+ if (chunk && chunk->data)
+ pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
+ pcpu_free_chunk(chunk);
+}
+
+static struct page *pcpu_addr_to_page(void *addr)
+{
+ return vmalloc_to_page(addr);
+}
+
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+ /* no extra restriction */
+ return 0;
+}
diff --git a/mm/percpu.c b/mm/percpu.c
index 083e7c91e5f6..3f930018aa60 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,5 +1,5 @@
/*
- * linux/mm/percpu.c - percpu memory allocator
+ * mm/percpu.c - percpu memory allocator
*
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
@@ -7,14 +7,13 @@
* This file is released under the GPLv2.
*
* This is percpu allocator which can handle both static and dynamic
- * areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of boot-time determined number of units and the
- * first chunk is used for static percpu variables in the kernel image
+ * areas. Percpu areas are allocated in chunks. Each chunk
+ * consists of a boot-time determined number of units and the first
+ * chunk is used for static percpu variables in the kernel image
* (special boot time alloc/init handling necessary as these areas
* need to be brought up before allocation services are running).
* Unit grows as necessary and all units grow or shrink in unison.
- * When a chunk is filled up, another chunk is allocated. ie. in
- * vmalloc area
+ * When a chunk is filled up, another chunk is allocated.
*
* c0 c1 c2
* ------------------- ------------------- ------------
@@ -32,7 +31,7 @@
* as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
- * guaranteed to be eqaul to or larger than the maximum contiguous
+ * guaranteed to be equal to or larger than the maximum contiguous
* area in the chunk. This helps the allocator not to iterate the
* chunk maps unnecessarily.
*
@@ -77,17 +76,25 @@
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
+#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr) \
- (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
- + (unsigned long)__per_cpu_start)
+ (void __percpu *)((unsigned long)(addr) - \
+ (unsigned long)pcpu_base_addr + \
+ (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr) \
- (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
- - (unsigned long)__per_cpu_start)
+ (void __force *)((unsigned long)(ptr) + \
+ (unsigned long)pcpu_base_addr - \
+ (unsigned long)__per_cpu_start)
#endif
+#else /* CONFIG_SMP */
+/* on UP, it's always identity mapped */
+#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
+#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
+#endif /* CONFIG_SMP */
struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
@@ -97,7 +104,7 @@ struct pcpu_chunk {
int map_used; /* # of map entries used */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
- struct vm_struct **vms; /* mapped vmalloc regions */
+ void *data; /* chunk data */
bool immutable; /* no [de]population allowed */
unsigned long populated[]; /* populated bitmap */
};
@@ -175,6 +182,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
static void pcpu_reclaim(struct work_struct *work);
static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+static bool pcpu_addr_in_first_chunk(void *addr)
+{
+ void *first_start = pcpu_first_chunk->base_addr;
+
+ return addr >= first_start && addr < first_start + pcpu_unit_size;
+}
+
+static bool pcpu_addr_in_reserved_chunk(void *addr)
+{
+ void *first_start = pcpu_first_chunk->base_addr;
+
+ return addr >= first_start &&
+ addr < first_start + pcpu_reserved_chunk_limit;
+}
+
static int __pcpu_size_to_slot(int size)
{
int highbit = fls(size); /* size is in bytes */
@@ -196,27 +218,6 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
return pcpu_size_to_slot(chunk->free_size);
}
-static int pcpu_page_idx(unsigned int cpu, int page_idx)
-{
- return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
-}
-
-static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
- (page_idx << PAGE_SHIFT);
-}
-
-static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- /* must not be used on pre-mapped chunk */
- WARN_ON(chunk->immutable);
-
- return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
-}
-
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
@@ -229,13 +230,27 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
return (struct pcpu_chunk *)page->index;
}
-static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
+}
+
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
+ (page_idx << PAGE_SHIFT);
+}
+
+static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
+ int *rs, int *re, int end)
{
*rs = find_next_zero_bit(chunk->populated, end, *rs);
*re = find_next_bit(chunk->populated, end, *rs + 1);
}
-static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
+ int *rs, int *re, int end)
{
*rs = find_next_bit(chunk->populated, end, *rs);
*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
@@ -243,7 +258,7 @@ static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
/*
* (Un)populated page region iterators. Iterate over (un)populated
- * page regions betwen @start and @end in @chunk. @rs and @re should
+ * page regions between @start and @end in @chunk. @rs and @re should
* be integer variables and will be set to start and end page index of
* the current region.
*/
@@ -273,14 +288,13 @@ static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
*/
static void *pcpu_mem_alloc(size_t size)
{
+ if (WARN_ON_ONCE(!slab_is_available()))
+ return NULL;
+
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
- else {
- void *ptr = vmalloc(size);
- if (ptr)
- memset(ptr, 0, size);
- return ptr;
- }
+ else
+ return vzalloc(size);
}
/**
@@ -324,36 +338,6 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
}
/**
- * pcpu_chunk_addr_search - determine chunk containing specified address
- * @addr: address for which the chunk needs to be determined.
- *
- * RETURNS:
- * The address of the found chunk.
- */
-static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
-{
- void *first_start = pcpu_first_chunk->base_addr;
-
- /* is it in the first chunk? */
- if (addr >= first_start && addr < first_start + pcpu_unit_size) {
- /* is it in the reserved area? */
- if (addr < first_start + pcpu_reserved_chunk_limit)
- return pcpu_reserved_chunk;
- return pcpu_first_chunk;
- }
-
- /*
- * The address is relative to unit0 which might be unused and
- * thus unmapped. Offset the address to the unit space of the
- * current processor before looking it up in the vmalloc
- * space. Note that any possible cpu id can be used here, so
- * there's no need to worry about preemption or cpu hotplug.
- */
- addr += pcpu_unit_offsets[raw_smp_processor_id()];
- return pcpu_get_page_chunk(vmalloc_to_page(addr));
-}
-
-/**
* pcpu_need_to_extend - determine whether chunk area map needs to be extended
* @chunk: chunk of interest
*
@@ -411,14 +395,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
goto out_unlock;
old_size = chunk->map_alloc * sizeof(chunk->map[0]);
- memcpy(new, chunk->map, old_size);
+ old = chunk->map;
- /*
- * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
- * one of the first chunks and still using static map.
- */
- if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
- old = chunk->map;
+ memcpy(new, old, old_size);
chunk->map_alloc = new_alloc;
chunk->map = new;
@@ -621,436 +600,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
pcpu_chunk_relocate(chunk, oslot);
}
-/**
- * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
- * @chunk: chunk of interest
- * @bitmapp: output parameter for bitmap
- * @may_alloc: may allocate the array
- *
- * Returns pointer to array of pointers to struct page and bitmap,
- * both of which can be indexed with pcpu_page_idx(). The returned
- * array is cleared to zero and *@bitmapp is copied from
- * @chunk->populated. Note that there is only one array and bitmap
- * and access exclusion is the caller's responsibility.
- *
- * CONTEXT:
- * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
- * Otherwise, don't care.
- *
- * RETURNS:
- * Pointer to temp pages array on success, NULL on failure.
- */
-static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
- unsigned long **bitmapp,
- bool may_alloc)
-{
- static struct page **pages;
- static unsigned long *bitmap;
- size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
- size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
- sizeof(unsigned long);
-
- if (!pages || !bitmap) {
- if (may_alloc && !pages)
- pages = pcpu_mem_alloc(pages_size);
- if (may_alloc && !bitmap)
- bitmap = pcpu_mem_alloc(bitmap_size);
- if (!pages || !bitmap)
- return NULL;
- }
-
- memset(pages, 0, pages_size);
- bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
-
- *bitmapp = bitmap;
- return pages;
-}
-
-/**
- * pcpu_free_pages - free pages which were allocated for @chunk
- * @chunk: chunk pages were allocated for
- * @pages: array of pages to be freed, indexed by pcpu_page_idx()
- * @populated: populated bitmap
- * @page_start: page index of the first page to be freed
- * @page_end: page index of the last page to be freed + 1
- *
- * Free pages [@page_start and @page_end) in @pages for all units.
- * The pages were allocated for @chunk.
- */
-static void pcpu_free_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
-{
- unsigned int cpu;
- int i;
-
- for_each_possible_cpu(cpu) {
- for (i = page_start; i < page_end; i++) {
- struct page *page = pages[pcpu_page_idx(cpu, i)];
-
- if (page)
- __free_page(page);
- }
- }
-}
-
-/**
- * pcpu_alloc_pages - allocates pages for @chunk
- * @chunk: target chunk
- * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
- * @populated: populated bitmap
- * @page_start: page index of the first page to be allocated
- * @page_end: page index of the last page to be allocated + 1
- *
- * Allocate pages [@page_start,@page_end) into @pages for all units.
- * The allocation is for @chunk. Percpu core doesn't care about the
- * content of @pages and will pass it verbatim to pcpu_map_pages().
- */
-static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
-{
- const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
- unsigned int cpu;
- int i;
-
- for_each_possible_cpu(cpu) {
- for (i = page_start; i < page_end; i++) {
- struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
-
- *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
- if (!*pagep) {
- pcpu_free_pages(chunk, pages, populated,
- page_start, page_end);
- return -ENOMEM;
- }
- }
- }
- return 0;
-}
-
-/**
- * pcpu_pre_unmap_flush - flush cache prior to unmapping
- * @chunk: chunk the regions to be flushed belongs to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages in [@page_start,@page_end) of @chunk are about to be
- * unmapped. Flush cache. As each flushing trial can be very
- * expensive, issue flush on the whole region at once rather than
- * doing it for each cpu. This could be an overkill but is more
- * scalable.
- */
-static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
- int page_start, int page_end)
-{
- flush_cache_vunmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
-}
-
-static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
-{
- unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
-}
-
-/**
- * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
- * @chunk: chunk of interest
- * @pages: pages array which can be used to pass information to free
- * @populated: populated bitmap
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- *
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * Corresponding elements in @pages were cleared by the caller and can
- * be used to carry information to pcpu_free_pages() which will be
- * called after all unmaps are finished. The caller should call
- * proper pre/post flush functions.
- */
-static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
+static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
- unsigned int cpu;
- int i;
+ struct pcpu_chunk *chunk;
- for_each_possible_cpu(cpu) {
- for (i = page_start; i < page_end; i++) {
- struct page *page;
+ chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
+ if (!chunk)
+ return NULL;
- page = pcpu_chunk_page(chunk, cpu, i);
- WARN_ON(!page);
- pages[pcpu_page_idx(cpu, i)] = page;
- }
- __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
- page_end - page_start);
+ chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+ if (!chunk->map) {
+ kfree(chunk);
+ return NULL;
}
- for (i = page_start; i < page_end; i++)
- __clear_bit(i, populated);
-}
+ chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+ chunk->map[chunk->map_used++] = pcpu_unit_size;
-/**
- * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
- * @chunk: pcpu_chunk the regions to be flushed belong to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
- * TLB for the regions. This can be skipped if the area is to be
- * returned to vmalloc as vmalloc will handle TLB flushing lazily.
- *
- * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
- * for the whole region.
- */
-static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
- int page_start, int page_end)
-{
- flush_tlb_kernel_range(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
-}
+ INIT_LIST_HEAD(&chunk->list);
+ chunk->free_size = pcpu_unit_size;
+ chunk->contig_hint = pcpu_unit_size;
-static int __pcpu_map_pages(unsigned long addr, struct page **pages,
- int nr_pages)
-{
- return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
- PAGE_KERNEL, pages);
+ return chunk;
}
-/**
- * pcpu_map_pages - map pages into a pcpu_chunk
- * @chunk: chunk of interest
- * @pages: pages array containing pages to be mapped
- * @populated: populated bitmap
- * @page_start: page index of the first page to map
- * @page_end: page index of the last page to map + 1
- *
- * For each cpu, map pages [@page_start,@page_end) into @chunk. The
- * caller is responsible for calling pcpu_post_map_flush() after all
- * mappings are complete.
- *
- * This function is responsible for setting corresponding bits in
- * @chunk->populated bitmap and whatever is necessary for reverse
- * lookup (addr -> chunk).
- */
-static int pcpu_map_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
+static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
- unsigned int cpu, tcpu;
- int i, err;
-
- for_each_possible_cpu(cpu) {
- err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
- &pages[pcpu_page_idx(cpu, page_start)],
- page_end - page_start);
- if (err < 0)
- goto err;
- }
-
- /* mapping successful, link chunk and mark populated */
- for (i = page_start; i < page_end; i++) {
- for_each_possible_cpu(cpu)
- pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
- chunk);
- __set_bit(i, populated);
- }
-
- return 0;
-
-err:
- for_each_possible_cpu(tcpu) {
- if (tcpu == cpu)
- break;
- __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
- page_end - page_start);
- }
- return err;
+ if (!chunk)
+ return;
+ pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
+ kfree(chunk);
}
-/**
- * pcpu_post_map_flush - flush cache after mapping
- * @chunk: pcpu_chunk the regions to be flushed belong to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
- * cache.
- *
- * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
- * for the whole region.
+/*
+ * Chunk management implementation.
+ *
+ * To allow different implementations, chunk alloc/free and
+ * [de]population are implemented in a separate file which is pulled
+ * into this file and compiled together. The following functions
+ * should be implemented.
+ *
+ * pcpu_populate_chunk - populate the specified range of a chunk
+ * pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_create_chunk - create a new chunk
+ * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
+ * pcpu_addr_to_page - translate address to struct page
+ * pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
-static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
- int page_start, int page_end)
-{
- flush_cache_vmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
-}
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static struct pcpu_chunk *pcpu_create_chunk(void);
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
+static struct page *pcpu_addr_to_page(void *addr);
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
+
+#ifdef CONFIG_NEED_PER_CPU_KM
+#include "percpu-km.c"
+#else
+#include "percpu-vm.c"
+#endif
/**
- * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
- * @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
- *
- * For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk. If @flush is true, vcache is flushed before unmapping
- * and tlb after.
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
*
- * CONTEXT:
- * pcpu_alloc_mutex.
+ * RETURNS:
+ * The address of the found chunk.
*/
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- struct page **pages;
- unsigned long *populated;
- int rs, re;
-
- /* quick path, check whether it's empty already */
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
- if (rs == page_start && re == page_end)
- return;
- break;
+ /* is it in the first chunk? */
+ if (pcpu_addr_in_first_chunk(addr)) {
+ /* is it in the reserved area? */
+ if (pcpu_addr_in_reserved_chunk(addr))
+ return pcpu_reserved_chunk;
+ return pcpu_first_chunk;
}
- /* immutable chunks can't be depopulated */
- WARN_ON(chunk->immutable);
-
/*
- * If control reaches here, there must have been at least one
- * successful population attempt so the temp pages array must
- * be available now.
+ * The address is relative to unit0 which might be unused and
+ * thus unmapped. Offset the address to the unit space of the
+ * current processor before looking it up in the vmalloc
+ * space. Note that any possible cpu id can be used here, so
+ * there's no need to worry about preemption or cpu hotplug.
*/
- pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
- BUG_ON(!pages);
-
- /* unmap and free */
- pcpu_pre_unmap_flush(chunk, page_start, page_end);
-
- pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
- pcpu_unmap_pages(chunk, pages, populated, rs, re);
-
- /* no need to flush tlb, vmalloc will handle it lazily */
-
- pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
- pcpu_free_pages(chunk, pages, populated, rs, re);
-
- /* commit new bitmap */
- bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
-}
-
-/**
- * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
- * @chunk: chunk of interest
- * @off: offset to the area to populate
- * @size: size of the area to populate in bytes
- *
- * For each cpu, populate and map pages [@page_start,@page_end) into
- * @chunk. The area is cleared on return.
- *
- * CONTEXT:
- * pcpu_alloc_mutex, does GFP_KERNEL allocation.
- */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
-{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- int free_end = page_start, unmap_end = page_start;
- struct page **pages;
- unsigned long *populated;
- unsigned int cpu;
- int rs, re, rc;
-
- /* quick path, check whether all pages are already there */
- pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
- if (rs == page_start && re == page_end)
- goto clear;
- break;
- }
-
- /* need to allocate and map pages, this chunk can't be immutable */
- WARN_ON(chunk->immutable);
-
- pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
- if (!pages)
- return -ENOMEM;
-
- /* alloc and map */
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
- rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
- if (rc)
- goto err_free;
- free_end = re;
- }
-
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
- rc = pcpu_map_pages(chunk, pages, populated, rs, re);
- if (rc)
- goto err_unmap;
- unmap_end = re;
- }
- pcpu_post_map_flush(chunk, page_start, page_end);
-
- /* commit new bitmap */
- bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
-clear:
- for_each_possible_cpu(cpu)
- memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
- return 0;
-
-err_unmap:
- pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
- pcpu_unmap_pages(chunk, pages, populated, rs, re);
- pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
-err_free:
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
- pcpu_free_pages(chunk, pages, populated, rs, re);
- return rc;
-}
-
-static void free_pcpu_chunk(struct pcpu_chunk *chunk)
-{
- if (!chunk)
- return;
- if (chunk->vms)
- pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
- pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- kfree(chunk);
-}
-
-static struct pcpu_chunk *alloc_pcpu_chunk(void)
-{
- struct pcpu_chunk *chunk;
-
- chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
- if (!chunk)
- return NULL;
-
- chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
- chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[chunk->map_used++] = pcpu_unit_size;
-
- chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
- pcpu_nr_groups, pcpu_atom_size,
- GFP_KERNEL);
- if (!chunk->vms) {
- free_pcpu_chunk(chunk);
- return NULL;
- }
-
- INIT_LIST_HEAD(&chunk->list);
- chunk->free_size = pcpu_unit_size;
- chunk->contig_hint = pcpu_unit_size;
- chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
-
- return chunk;
+ addr += pcpu_unit_offsets[raw_smp_processor_id()];
+ return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
/**
@@ -1067,7 +702,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
-static void *pcpu_alloc(size_t size, size_t align, bool reserved)
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
{
static int warn_limit = 10;
struct pcpu_chunk *chunk;
@@ -1142,7 +777,7 @@ restart:
/* hmmm... no space left, create a new chunk */
spin_unlock_irqrestore(&pcpu_lock, flags);
- chunk = alloc_pcpu_chunk();
+ chunk = pcpu_create_chunk();
if (!chunk) {
err = "failed to allocate new chunk";
goto fail_unlock_mutex;
@@ -1187,8 +822,8 @@ fail_unlock_mutex:
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
- * Allocate percpu area of @size bytes aligned at @align. Might
- * sleep. Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align.
+ * Might sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -1196,7 +831,7 @@ fail_unlock_mutex:
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
-void *__alloc_percpu(size_t size, size_t align)
+void __percpu *__alloc_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, false);
}
@@ -1207,9 +842,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
- * Allocate percpu area of @size bytes aligned at @align from reserved
- * percpu area if arch has set it up; otherwise, allocation is served
- * from the same dynamic area. Might sleep. Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from reserved percpu area if arch has set it up; otherwise,
+ * allocation is served from the same dynamic area. Might sleep.
+ * Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -1217,7 +853,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
-void *__alloc_reserved_percpu(size_t size, size_t align)
+void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, true);
}
@@ -1254,7 +890,7 @@ static void pcpu_reclaim(struct work_struct *work)
list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
- free_pcpu_chunk(chunk);
+ pcpu_destroy_chunk(chunk);
}
mutex_unlock(&pcpu_alloc_mutex);
@@ -1269,7 +905,7 @@ static void pcpu_reclaim(struct work_struct *work)
* CONTEXT:
* Can be called from atomic context.
*/
-void free_percpu(void *ptr)
+void free_percpu(void __percpu *ptr)
{
void *addr;
struct pcpu_chunk *chunk;
@@ -1304,6 +940,35 @@ void free_percpu(void *ptr)
EXPORT_SYMBOL_GPL(free_percpu);
/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area. Module
+ * static percpu areas are not considered. For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+#ifdef CONFIG_SMP
+ const size_t static_size = __per_cpu_end - __per_cpu_start;
+ void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(base, cpu);
+
+ if ((void *)addr >= start && (void *)addr < start + static_size)
+ return true;
+ }
+#endif
+ /* on UP, can't distinguish from other static vars, always false */
+ return false;
+}
+
+/**
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
* @addr: the address to be converted to physical address
*
@@ -1317,25 +982,39 @@ EXPORT_SYMBOL_GPL(free_percpu);
*/
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
- if ((unsigned long)addr < VMALLOC_START ||
- (unsigned long)addr >= VMALLOC_END)
- return __pa(addr);
- else
- return page_to_phys(vmalloc_to_page(addr));
-}
-
-static inline size_t pcpu_calc_fc_sizes(size_t static_size,
- size_t reserved_size,
- ssize_t *dyn_sizep)
-{
- size_t size_sum;
+ void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+ bool in_first_chunk = false;
+ unsigned long first_start, first_end;
+ unsigned int cpu;
- size_sum = PFN_ALIGN(static_size + reserved_size +
- (*dyn_sizep >= 0 ? *dyn_sizep : 0));
- if (*dyn_sizep != 0)
- *dyn_sizep = size_sum - static_size - reserved_size;
+ /*
+ * The following test on first_start/end isn't strictly
+ * necessary but will speed up lookups of addresses which
+ * aren't in the first chunk.
+ */
+ first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
+ first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
+ pcpu_unit_pages);
+ if ((unsigned long)addr >= first_start &&
+ (unsigned long)addr < first_end) {
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(base, cpu);
+
+ if (addr >= start && addr < start + pcpu_unit_size) {
+ in_first_chunk = true;
+ break;
+ }
+ }
+ }
- return size_sum;
+ if (in_first_chunk) {
+ if ((unsigned long)addr < VMALLOC_START ||
+ (unsigned long)addr >= VMALLOC_END)
+ return __pa(addr);
+ else
+ return page_to_phys(vmalloc_to_page(addr));
+ } else
+ return page_to_phys(pcpu_addr_to_page(addr));
}
/**
@@ -1394,158 +1073,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
}
/**
- * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
- * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @atom_size: allocation atom size
- * @cpu_distance_fn: callback to determine distance between cpus, optional
- *
- * This function determines grouping of units, their mappings to cpus
- * and other parameters considering needed percpu size, allocation
- * atom size and distances between CPUs.
- *
- * Groups are always mutliples of atom size and CPUs which are of
- * LOCAL_DISTANCE both ways are grouped together and share space for
- * units in the same group. The returned configuration is guaranteed
- * to have CPUs on different nodes on different groups and >=75% usage
- * of allocated virtual address space.
- *
- * RETURNS:
- * On success, pointer to the new allocation_info is returned. On
- * failure, ERR_PTR value is returned.
- */
-struct pcpu_alloc_info * __init pcpu_build_alloc_info(
- size_t reserved_size, ssize_t dyn_size,
- size_t atom_size,
- pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
-{
- static int group_map[NR_CPUS] __initdata;
- static int group_cnt[NR_CPUS] __initdata;
- const size_t static_size = __per_cpu_end - __per_cpu_start;
- int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
- size_t size_sum, min_unit_size, alloc_size;
- int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
- int last_allocs, group, unit;
- unsigned int cpu, tcpu;
- struct pcpu_alloc_info *ai;
- unsigned int *cpu_map;
-
- /* this function may be called multiple times */
- memset(group_map, 0, sizeof(group_map));
- memset(group_cnt, 0, sizeof(group_map));
-
- /*
- * Determine min_unit_size, alloc_size and max_upa such that
- * alloc_size is multiple of atom_size and is the smallest
- * which can accomodate 4k aligned segments which are equal to
- * or larger than min_unit_size.
- */
- size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
- min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-
- alloc_size = roundup(min_unit_size, atom_size);
- upa = alloc_size / min_unit_size;
- while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
- upa--;
- max_upa = upa;
-
- /* group cpus according to their proximity */
- for_each_possible_cpu(cpu) {
- group = 0;
- next_group:
- for_each_possible_cpu(tcpu) {
- if (cpu == tcpu)
- break;
- if (group_map[tcpu] == group && cpu_distance_fn &&
- (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
- cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
- group++;
- nr_groups = max(nr_groups, group + 1);
- goto next_group;
- }
- }
- group_map[cpu] = group;
- group_cnt[group]++;
- group_cnt_max = max(group_cnt_max, group_cnt[group]);
- }
-
- /*
- * Expand unit size until address space usage goes over 75%
- * and then as much as possible without using more address
- * space.
- */
- last_allocs = INT_MAX;
- for (upa = max_upa; upa; upa--) {
- int allocs = 0, wasted = 0;
-
- if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
- continue;
-
- for (group = 0; group < nr_groups; group++) {
- int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
- allocs += this_allocs;
- wasted += this_allocs * upa - group_cnt[group];
- }
-
- /*
- * Don't accept if wastage is over 25%. The
- * greater-than comparison ensures upa==1 always
- * passes the following check.
- */
- if (wasted > num_possible_cpus() / 3)
- continue;
-
- /* and then don't consume more memory */
- if (allocs > last_allocs)
- break;
- last_allocs = allocs;
- best_upa = upa;
- }
- upa = best_upa;
-
- /* allocate and fill alloc_info */
- for (group = 0; group < nr_groups; group++)
- nr_units += roundup(group_cnt[group], upa);
-
- ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
- if (!ai)
- return ERR_PTR(-ENOMEM);
- cpu_map = ai->groups[0].cpu_map;
-
- for (group = 0; group < nr_groups; group++) {
- ai->groups[group].cpu_map = cpu_map;
- cpu_map += roundup(group_cnt[group], upa);
- }
-
- ai->static_size = static_size;
- ai->reserved_size = reserved_size;
- ai->dyn_size = dyn_size;
- ai->unit_size = alloc_size / upa;
- ai->atom_size = atom_size;
- ai->alloc_size = alloc_size;
-
- for (group = 0, unit = 0; group_cnt[group]; group++) {
- struct pcpu_group_info *gi = &ai->groups[group];
-
- /*
- * Initialize base_offset as if all groups are located
- * back-to-back. The caller should update this to
- * reflect actual allocation.
- */
- gi->base_offset = unit * ai->unit_size;
-
- for_each_possible_cpu(cpu)
- if (group_map[cpu] == group)
- gi->cpu_map[gi->nr_units++] = cpu;
- gi->nr_units = roundup(gi->nr_units, upa);
- unit += gi->nr_units;
- }
- BUG_ON(unit != nr_units);
-
- return ai;
-}
-
-/**
* pcpu_dump_alloc_info - print out information about pcpu_alloc_info
* @lvl: loglevel
* @ai: allocation info to dump
@@ -1662,7 +1189,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
static char cpus_buf[4096] __initdata;
- static int smap[2], dmap[2];
+ static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+ static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
size_t dyn_size = ai->dyn_size;
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
struct pcpu_chunk *schunk, *dchunk = NULL;
@@ -1685,14 +1213,16 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
} while (0)
/* sanity checks */
- BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
- ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
+#endif
PCPU_SETUP_BUG_ON(!base_addr);
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+ PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+ PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
@@ -1724,9 +1254,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (pcpu_first_unit_cpu == NR_CPUS)
pcpu_first_unit_cpu = cpu;
+ pcpu_last_unit_cpu = cpu;
}
}
- pcpu_last_unit_cpu = cpu;
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
@@ -1734,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
- pcpu_dump_alloc_info(KERN_INFO, ai);
+ pcpu_dump_alloc_info(KERN_DEBUG, ai);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
@@ -1811,6 +1341,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
return 0;
}
+#ifdef CONFIG_SMP
+
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
@@ -1838,12 +1370,184 @@ static int __init percpu_alloc_setup(char *str)
}
early_param("percpu_alloc", percpu_alloc_setup);
+/*
+ * pcpu_embed_first_chunk() is used by the generic percpu setup.
+ * Build it if needed by the arch config or the generic setup is going
+ * to be used.
+ */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
+#define BUILD_EMBED_FIRST_CHUNK
+#endif
+
+/* build pcpu_page_first_chunk() iff needed by the arch config */
+#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#define BUILD_PAGE_FIRST_CHUNK
+#endif
+
+/* pcpu_build_alloc_info() is used by both embed and page first chunk */
+#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ *
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
+ *
+ * Groups are always multiples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group. The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, pointer to the new allocation_info is returned. On
+ * failure, ERR_PTR value is returned.
+ */
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+ size_t reserved_size, size_t dyn_size,
+ size_t atom_size,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ const size_t static_size = __per_cpu_end - __per_cpu_start;
+ int nr_groups = 1, nr_units = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs, group, unit;
+ unsigned int cpu, tcpu;
+ struct pcpu_alloc_info *ai;
+ unsigned int *cpu_map;
+
+ /* this function may be called multiple times */
+ memset(group_map, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_cnt));
+
+ /* calculate size_sum and ensure dyn_size is enough for early alloc */
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+ dyn_size = size_sum - static_size - reserved_size;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of atom_size and is the smallest
+ * which can accommodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, atom_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group && cpu_distance_fn &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ nr_groups = max(nr_groups, group + 1);
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group < nr_groups; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 1/3. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ upa = best_upa;
+
+ /* allocate and fill alloc_info */
+ for (group = 0; group < nr_groups; group++)
+ nr_units += roundup(group_cnt[group], upa);
+
+ ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+ if (!ai)
+ return ERR_PTR(-ENOMEM);
+ cpu_map = ai->groups[0].cpu_map;
+
+ for (group = 0; group < nr_groups; group++) {
+ ai->groups[group].cpu_map = cpu_map;
+ cpu_map += roundup(group_cnt[group], upa);
+ }
+
+ ai->static_size = static_size;
+ ai->reserved_size = reserved_size;
+ ai->dyn_size = dyn_size;
+ ai->unit_size = alloc_size / upa;
+ ai->atom_size = atom_size;
+ ai->alloc_size = alloc_size;
+
+ for (group = 0, unit = 0; group_cnt[group]; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+
+ /*
+ * Initialize base_offset as if all groups are located
+ * back-to-back. The caller should update this to
+ * reflect actual allocation.
+ */
+ gi->base_offset = unit * ai->unit_size;
+
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ gi->cpu_map[gi->nr_units++] = cpu;
+ gi->nr_units = roundup(gi->nr_units, upa);
+ unit += gi->nr_units;
+ }
+ BUG_ON(unit != nr_units);
+
+ return ai;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
+
+#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
* @alloc_fn: function to allocate percpu page
@@ -1864,10 +1568,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
* vmalloc space is not orders of magnitude larger than distances
* between node memory addresses (ie. 32bit NUMA machines).
*
- * When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment. When @dyn_size is auto,
- * @dyn_size is just big enough to fill page alignment after static
- * and reserved areas.
+ * @dyn_size specifies the minimum dynamic area size.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned using @free_fn.
@@ -1875,7 +1576,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
* RETURNS:
* 0 on success, -errno on failure.
*/
-int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_alloc_fn_t alloc_fn,
@@ -1971,10 +1672,9 @@ out_free:
free_bootmem(__pa(areas), areas_size);
return rc;
}
-#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
- !CONFIG_HAVE_SETUP_PER_CPU_AREA */
+#endif /* BUILD_EMBED_FIRST_CHUNK */
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+#ifdef BUILD_PAGE_FIRST_CHUNK
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
@@ -2006,7 +1706,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
- ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
+ ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
@@ -2082,10 +1782,11 @@ out_free_ar:
pcpu_free_alloc_info(ai);
return rc;
}
-#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
+#endif /* BUILD_PAGE_FIRST_CHUNK */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
- * Generic percpu area setup.
+ * Generic SMP percpu area setup.
*
* The embedding helper is used because its behavior closely resembles
* the original non-dynamic generic percpu area setup. This is
@@ -2096,7 +1797,6 @@ out_free_ar:
* on the physical linear memory mapping which uses large page
* mappings on applicable archs.
*/
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
@@ -2125,10 +1825,75 @@ void __init setup_per_cpu_areas(void)
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
if (rc < 0)
- panic("Failed to initialized percpu areas.");
+ panic("Failed to initialize percpu areas.");
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+#else /* CONFIG_SMP */
+
+/*
+ * UP percpu area setup.
+ *
+ * UP always uses km-based percpu allocator with identity mapping.
+ * Static percpu variables are indistinguishable from the usual static
+ * variables and don't require any special preparation.
+ */
+void __init setup_per_cpu_areas(void)
+{
+ const size_t unit_size =
+ roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
+ PERCPU_DYNAMIC_RESERVE));
+ struct pcpu_alloc_info *ai;
+ void *fc;
+
+ ai = pcpu_alloc_alloc_info(1, 1);
+ fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!ai || !fc)
+ panic("Failed to allocate memory for percpu areas.");
+
+ ai->dyn_size = unit_size;
+ ai->unit_size = unit_size;
+ ai->atom_size = unit_size;
+ ai->alloc_size = unit_size;
+ ai->groups[0].nr_units = 1;
+ ai->groups[0].cpu_map[0] = 0;
+
+ if (pcpu_setup_first_chunk(ai, fc) < 0)
+ panic("Failed to initialize percpu areas.");
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * First and reserved chunks are initialized with temporary allocation
+ * map in initdata so that they can be used before slab is online.
+ * This function is called after slab is brought up and replaces those
+ * with properly allocated maps.
+ */
+void __init percpu_init_late(void)
+{
+ struct pcpu_chunk *target_chunks[] =
+ { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
+ struct pcpu_chunk *chunk;
+ unsigned long flags;
+ int i;
+
+ for (i = 0; (chunk = target_chunks[i]); i++) {
+ int *map;
+ const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
+
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ map = pcpu_mem_alloc(size);
+ BUG_ON(!map);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ memcpy(map, chunk->map, size);
+ chunk->map = map;
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ }
+}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..0369f5b3ba1b
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,120 @@
+/*
+ * mm/pgtable-generic.c
+ *
+ * Generic pgtable methods declared in asm-generic/pgtable.h
+ *
+ * Copyright (C) 2010 Linus Torvalds
+ */
+
+#include <asm/tlb.h>
+#include <asm-generic/pgtable.h>
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed, and
+ * writable). Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update_mmu_cache. This
+ * used to be done in the caller, but sparc needs minor faults to
+ * force that call on sun4c so we changed this macro slightly
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+ if (changed) {
+ set_pte_at(vma->vm_mm, address, ptep, entry);
+ flush_tlb_page(vma, address);
+ }
+ return changed;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int changed = !pmd_same(*pmdp, entry);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (changed) {
+ set_pmd_at(vma->vm_mm, address, pmdp, entry);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+ return changed;
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+ BUG();
+ return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+ BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte;
+ pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Generic pmdp_splitting_flush(): store pmd_mksplitting(*pmdp) back into
+ * @pmdp for the mm owning @vma, then flush the TLB for the range
+ * [@address, @address + HPAGE_PMD_SIZE).  @address must be aligned to
+ * HPAGE_PMD_SIZE (checked with VM_BUG_ON).
+ *
+ * Returns the pmd value that was installed.  The function is declared to
+ * return pmd_t, so it must not fall off the end without a return value:
+ * a caller using the result would otherwise read an indeterminate value.
+ */
+pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = pmd_mksplitting(*pmdp);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+ /* tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6633965bb27b..2876349339a7 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -14,6 +14,7 @@
*/
#include <linux/kernel.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index 033bc135a41f..77506a291a2d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
@@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
if (!ra->ra_pages)
return;
+ /* be dumb */
+ if (filp && (filp->f_mode & FMODE_RANDOM)) {
+ force_page_cache_readahead(mapping, filp, offset, req_size);
+ return;
+ }
+
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
@@ -516,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
* @req_size: hint: total size of the read which the caller is performing in
* pagecache pages
*
- * page_cache_async_ondemand() should be called when a page is used which
+ * page_cache_async_readahead() should be called when a page is used which
* has the PG_readahead flag; this is a marker to suggest that the application
* has used up enough of the readahead window that we should start pulling in
* more pages.
diff --git a/mm/rmap.c b/mm/rmap.c
index 278cd277bdec..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,12 +56,14 @@
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include "internal.h"
static struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_chain_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
{
@@ -73,6 +75,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
kmem_cache_free(anon_vma_cachep, anon_vma);
}
+static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+{
+ return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+}
+
+static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+{
+ kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
+}
+
/**
* anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
@@ -82,7 +94,7 @@ void anon_vma_free(struct anon_vma *anon_vma)
* anonymous pages mapped into it with that anon_vma.
*
* The common case will be that we already have one, but if
- * if not we either need to find an adjacent mapping that we
+ * not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
* allocate a new one.
@@ -103,80 +115,187 @@ void anon_vma_free(struct anon_vma *anon_vma)
int anon_vma_prepare(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc;
might_sleep();
if (unlikely(!anon_vma)) {
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated;
+ avc = anon_vma_chain_alloc();
+ if (!avc)
+ goto out_enomem;
+
anon_vma = find_mergeable_anon_vma(vma);
allocated = NULL;
if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
- return -ENOMEM;
+ goto out_enomem_free_avc;
allocated = anon_vma;
+ /*
+ * This VMA had no anon_vma yet. This anon_vma is
+ * the root of any anon_vma tree that might form.
+ */
+ anon_vma->root = anon_vma;
}
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ avc->anon_vma = anon_vma;
+ avc->vma = vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+ list_add_tail(&avc->same_anon_vma, &anon_vma->head);
allocated = NULL;
+ avc = NULL;
}
spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock(anon_vma);
- spin_unlock(&anon_vma->lock);
if (unlikely(allocated))
anon_vma_free(allocated);
+ if (unlikely(avc))
+ anon_vma_chain_free(avc);
}
return 0;
+
+ out_enomem_free_avc:
+ anon_vma_chain_free(avc);
+ out_enomem:
+ return -ENOMEM;
}
-void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
{
- BUG_ON(vma->anon_vma != next->anon_vma);
- list_del(&next->anon_vma_node);
+ avc->vma = vma;
+ avc->anon_vma = anon_vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+
+ anon_vma_lock(anon_vma);
+ /*
+ * It's critical to add new vmas to the tail of the anon_vma,
+ * see comment in huge_memory.c:__split_huge_page().
+ */
+ list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ anon_vma_unlock(anon_vma);
}
-void __anon_vma_link(struct vm_area_struct *vma)
+/*
+ * Attach the anon_vmas from src to dst.
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc, *pavc;
- if (anon_vma)
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
+ avc = anon_vma_chain_alloc();
+ if (!avc)
+ goto enomem_failure;
+ anon_vma_chain_link(dst, avc, pavc->anon_vma);
+ }
+ return 0;
+
+ enomem_failure:
+ unlink_anon_vmas(dst);
+ return -ENOMEM;
}
-void anon_vma_link(struct vm_area_struct *vma)
+/*
+ * Attach vma to its own anon_vma, as well as to the anon_vmas that
+ * the corresponding VMA in the parent process is attached to.
+ * Returns 0 on success, non-zero on failure.
+ */
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc;
+ struct anon_vma *anon_vma;
- if (anon_vma) {
- spin_lock(&anon_vma->lock);
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- spin_unlock(&anon_vma->lock);
- }
+ /* Don't bother if the parent process has no anon_vma here. */
+ if (!pvma->anon_vma)
+ return 0;
+
+ /*
+ * First, attach the new VMA to the parent VMA's anon_vmas,
+ * so rmap can find non-COWed pages in child processes.
+ */
+ if (anon_vma_clone(vma, pvma))
+ return -ENOMEM;
+
+ /* Then add our own anon_vma. */
+ anon_vma = anon_vma_alloc();
+ if (!anon_vma)
+ goto out_error;
+ avc = anon_vma_chain_alloc();
+ if (!avc)
+ goto out_error_free_anon_vma;
+
+ /*
+ * The root anon_vma's spinlock is the lock actually used when we
+ * lock any of the anon_vmas in this anon_vma tree.
+ */
+ anon_vma->root = pvma->anon_vma->root;
+ /*
+ * With KSM refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned
+ * until this anon_vma is freed, because the lock lives in the root.
+ */
+ get_anon_vma(anon_vma->root);
+ /* Mark this anon_vma as the one where our new (COWed) pages go. */
+ vma->anon_vma = anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
+
+ return 0;
+
+ out_error_free_anon_vma:
+ anon_vma_free(anon_vma);
+ out_error:
+ unlink_anon_vmas(vma);
+ return -ENOMEM;
}
-void anon_vma_unlink(struct vm_area_struct *vma)
+static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
int empty;
+ /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
if (!anon_vma)
return;
- spin_lock(&anon_vma->lock);
- list_del(&vma->anon_vma_node);
+ anon_vma_lock(anon_vma);
+ list_del(&anon_vma_chain->same_anon_vma);
/* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
- spin_unlock(&anon_vma->lock);
+ empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
+ anon_vma_unlock(anon_vma);
- if (empty)
+ if (empty) {
+ /* We no longer need the root anon_vma */
+ if (anon_vma->root != anon_vma)
+ drop_anon_vma(anon_vma->root);
anon_vma_free(anon_vma);
+ }
+}
+
+void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc, *next;
+
+ /*
+ * Unlink each anon_vma chained to the VMA. This list is ordered
+ * from newest to oldest, ensuring the root anon_vma gets freed last.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ anon_vma_unlink(avc);
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
}
static void anon_vma_ctor(void *data)
@@ -184,7 +303,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
spin_lock_init(&anon_vma->lock);
- ksm_refcount_init(anon_vma);
+ anonvma_external_refcount_init(anon_vma);
INIT_LIST_HEAD(&anon_vma->head);
}
@@ -192,15 +311,16 @@ void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
}
/*
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *__page_lock_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma;
+ struct anon_vma *anon_vma, *root_anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
@@ -211,16 +331,31 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
- return anon_vma;
+ root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ spin_lock(&root_anon_vma->lock);
+
+ /*
+ * If this page is still mapped, then its anon_vma cannot have been
+ * freed. But if it has been unmapped, we have no security against
+ * the anon_vma structure being freed and reused (for another anon_vma:
+ * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
+ * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
+ * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+ */
+ if (page_mapped(page))
+ return anon_vma;
+
+ spin_unlock(&root_anon_vma->lock);
out:
rcu_read_unlock();
return NULL;
}
void page_unlock_anon_vma(struct anon_vma *anon_vma)
+ __releases(&anon_vma->root->lock)
+ __releases(RCU)
{
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
rcu_read_unlock();
}
@@ -229,12 +364,14 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
* Returns virtual address or -EFAULT if page's index/offset is not
* within the range mapped the @vma.
*/
-static inline unsigned long
+inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
unsigned long address;
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ pgoff = page->index << huge_page_order(page_hstate(page));
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
/* page should be within @vma mapping range */
@@ -245,12 +382,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
/*
* At what user virtual address is page expected in vma?
- * checking that the page matches the vma.
+ * Caller should check the page is actually part of the vma.
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
- if (vma->anon_vma != page_anon_vma(page))
+ struct anon_vma *page__anon_vma = page_anon_vma(page);
+ /*
+ * Note: swapoff's unuse_vma() is more efficient with this
+ * check, and needs it to match anon_vma when KSM is active.
+ */
+ if (!vma->anon_vma || !page__anon_vma ||
+ vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
@@ -270,7 +413,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
*
* On success returns with pte mapped and locked.
*/
-pte_t *page_check_address(struct page *page, struct mm_struct *mm,
+pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp, int sync)
{
pgd_t *pgd;
@@ -279,6 +422,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pte_t *pte;
spinlock_t *ptl;
+ if (unlikely(PageHuge(page))) {
+ pte = huge_pte_offset(mm, address);
+ ptl = &mm->page_table_lock;
+ goto check;
+ }
+
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return NULL;
@@ -290,6 +439,8 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return NULL;
+ if (pmd_trans_huge(*pmd))
+ return NULL;
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
@@ -299,6 +450,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
}
ptl = pte_lockptr(mm, pmd);
+check:
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
@@ -343,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
- spinlock_t *ptl;
int referenced = 0;
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- goto out;
-
/*
* Don't want to elevate referenced for mlocked page that gets this far,
* in order that it progresses to try_to_unmap and is moved to the
* unevictable list.
*/
if (vma->vm_flags & VM_LOCKED) {
- *mapcount = 1; /* break early from loop */
+ *mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
- goto out_unmap;
- }
-
- if (ptep_clear_flush_young_notify(vma, address, pte)) {
- /*
- * Don't treat a reference through a sequentially read
- * mapping as such. If the page has been used in
- * another mapping, we will catch it; if this other
- * mapping is already gone, the unmap path will have
- * set PG_referenced or activated the page.
- */
- if (likely(!VM_SequentialReadHint(vma)))
- referenced++;
+ goto out;
}
/* Pretend the page is referenced if the task has the
@@ -380,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
rwsem_is_locked(&mm->mmap_sem))
referenced++;
-out_unmap:
+ if (unlikely(PageTransHuge(page))) {
+ pmd_t *pmd;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_FLAG);
+ if (pmd && !pmd_trans_splitting(*pmd) &&
+ pmdp_clear_flush_young_notify(vma, address, pmd))
+ referenced++;
+ spin_unlock(&mm->page_table_lock);
+ } else {
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = page_check_address(page, mm, address, &ptl, 0);
+ if (!pte)
+ goto out;
+
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!VM_SequentialReadHint(vma)))
+ referenced++;
+ }
+ pte_unmap_unlock(pte, ptl);
+ }
+
(*mapcount)--;
- pte_unmap_unlock(pte, ptl);
if (referenced)
*vm_flags |= vma->vm_flags;
@@ -396,7 +560,7 @@ static int page_referenced_anon(struct page *page,
{
unsigned int mapcount;
struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int referenced = 0;
anon_vma = page_lock_anon_vma(page);
@@ -404,7 +568,8 @@ static int page_referenced_anon(struct page *page,
return referenced;
mapcount = page_mapcount(page);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
@@ -511,9 +676,6 @@ int page_referenced(struct page *page,
int referenced = 0;
int we_locked = 0;
- if (TestClearPageReferenced(page))
- referenced++;
-
*vm_flags = 0;
if (page_mapped(page) && page_rmapping(page)) {
if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
@@ -603,7 +765,7 @@ int page_mkclean(struct page *page)
if (mapping) {
ret = page_mkclean_file(mapping, page);
if (page_test_dirty(page)) {
- page_clear_dirty(page);
+ page_clear_dirty(page, 1);
ret = 1;
}
}
@@ -614,17 +776,54 @@ int page_mkclean(struct page *page)
EXPORT_SYMBOL_GPL(page_mkclean);
/**
- * __page_set_anon_rmap - setup new anonymous rmap
- * @page: the page to add the mapping to
- * @vma: the vm area in which the mapping is added
+ * page_move_anon_rmap - move a page to our anon_vma
+ * @page: the page to move to our anon_vma
+ * @vma: the vma the page belongs to
* @address: the user virtual address mapped
+ *
+ * When a page belongs exclusively to one process after a COW event,
+ * that page can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling
+ * processes.
*/
-static void __page_set_anon_rmap(struct page *page,
+void page_move_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!anon_vma);
+ VM_BUG_ON(page->index != linear_page_index(vma, address));
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+}
+
+/**
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page: Page to add to rmap
+ * @vma: VM area to add page to.
+ * @address: User virtual address of the mapping
+ * @exclusive: the page is exclusively owned by the current process
+ */
+static void __page_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
BUG_ON(!anon_vma);
+
+ if (PageAnon(page))
+ return;
+
+ /*
+ * If the page isn't exclusively mapped into this vma,
+ * we must use the _oldest_ possible anon_vma for the
+ * page mapping!
+ */
+ if (!exclusive)
+ anon_vma = anon_vma->root;
+
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
@@ -652,9 +851,7 @@ static void __page_check_anon_rmap(struct page *page,
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
- struct anon_vma *anon_vma = vma->anon_vma;
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- BUG_ON(page->mapping != (struct address_space *)anon_vma);
+ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
BUG_ON(page->index != linear_page_index(vma, address));
#endif
}
@@ -673,16 +870,32 @@ static void __page_check_anon_rmap(struct page *page,
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ do_page_add_anon_rmap(page, vma, address, 0);
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
int first = atomic_inc_and_test(&page->_mapcount);
- if (first)
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (first) {
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
+ }
if (unlikely(PageKsm(page)))
return;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
if (first)
- __page_set_anon_rmap(page, vma, address);
+ __page_set_anon_rmap(page, vma, address, exclusive);
else
__page_check_anon_rmap(page, vma, address);
}
@@ -703,8 +916,11 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- __inc_zone_page_state(page, NR_ANON_PAGES);
- __page_set_anon_rmap(page, vma, address);
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __page_set_anon_rmap(page, vma, address, 1);
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
else
@@ -721,7 +937,7 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, 1);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
}
@@ -745,15 +961,25 @@ void page_remove_rmap(struct page *page)
* containing the swap entry, but page not yet written to swap.
*/
if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
- page_clear_dirty(page);
+ page_clear_dirty(page, 1);
set_page_dirty(page);
}
+ /*
+ * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+ * and not charged by memcg for now.
+ */
+ if (unlikely(PageHuge(page)))
+ return;
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
- __dec_zone_page_state(page, NR_ANON_PAGES);
+ if (!PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __dec_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, -1);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
/*
* It would be tidy to reset the PageAnon mapping here,
@@ -815,9 +1041,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
if (PageAnon(page))
- dec_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, MM_ANONPAGES);
else
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
@@ -839,7 +1065,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- dec_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
@@ -857,7 +1084,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
page_remove_rmap(page);
page_cache_release(page);
@@ -996,7 +1223,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
@@ -1005,6 +1232,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
/**
* try_to_unmap_anon - unmap or unlock anonymous page using the object-based
* rmap method
@@ -1024,15 +1265,30 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return ret;
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- unsigned long address = vma_address(page, vma);
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long address;
+
+ /*
+ * During exec, a temporary VMA is set up and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
+ is_vma_temporary_stack(vma))
+ continue;
+
+ address = vma_address(page, vma);
if (address == -EFAULT)
continue;
ret = try_to_unmap_one(page, vma, address, flags);
@@ -1174,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
int ret;
BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
if (unlikely(PageKsm(page)))
ret = try_to_unmap_ksm(page, flags);
@@ -1213,6 +1470,42 @@ int try_to_munlock(struct page *page)
return try_to_unmap_file(page, TTU_MUNLOCK);
}
+#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
+/*
+ * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
+ * if necessary. Be careful to do all the tests under the lock. Once
+ * we know we are the last user, nobody else can get a reference and we
+ * can do the freeing without the lock.
+ */
+void drop_anon_vma(struct anon_vma *anon_vma)
+{
+ BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
+ if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
+ struct anon_vma *root = anon_vma->root;
+ int empty = list_empty(&anon_vma->head);
+ int last_root_user = 0;
+ int root_empty = 0;
+
+ /*
+ * The refcount on a non-root anon_vma got dropped. Drop
+ * the refcount on the root and check if we need to free it.
+ */
+ if (empty && anon_vma != root) {
+ BUG_ON(atomic_read(&root->external_refcount) <= 0);
+ last_root_user = atomic_dec_and_test(&root->external_refcount);
+ root_empty = list_empty(&root->head);
+ }
+ anon_vma_unlock(anon_vma);
+
+ if (empty) {
+ anon_vma_free(anon_vma);
+ if (root_empty && last_root_user)
+ anon_vma_free(root);
+ }
+ }
+}
+#endif
+
#ifdef CONFIG_MIGRATION
/*
* rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1222,22 +1515,21 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *, unsigned long, void *), void *arg)
{
struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma()
* because that depends on page_mapped(); but not all its usages
- * are holding mmap_sem, which also gave the necessary guarantee
- * (that this anon_vma's slab has not already been destroyed).
- * This needs to be reviewed later: avoiding page_lock_anon_vma()
- * is risky, and currently limits the usefulness of rmap_walk().
+ * are holding mmap_sem. Users without mmap_sem are required to
+ * take a reference count to prevent the anon_vma from disappearing.
*/
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
- spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ anon_vma_lock(anon_vma);
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
@@ -1245,7 +1537,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN)
break;
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
return ret;
}
@@ -1291,3 +1583,49 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
return rmap_walk_file(page, rmap_one, arg);
}
#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+
+ if (PageAnon(page))
+ return;
+ if (!exclusive)
+ anon_vma = anon_vma->root;
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+ page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ int first;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!anon_vma);
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ first = atomic_inc_and_test(&page->_mapcount);
+ if (first)
+ __hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ atomic_set(&page->_mapcount, 0);
+ __hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index eef4ebea5158..5ee67c990602 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/percpu_counter.h>
#include <linux/swap.h>
static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += pages;
+ percpu_counter_add(&sbinfo->used_blocks, -pages);
+ spin_lock(&inode->i_lock);
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
}
}
@@ -416,25 +417,21 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
if (sgp == SGP_READ)
return shmem_swp_map(ZERO_PAGE(0));
/*
- * Test free_blocks against 1 not 0, since we have 1 data
+ * Test used_blocks against 1 less than max_blocks, since we have 1 data
* page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data.
*/
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks <= 1) {
- spin_unlock(&sbinfo->stat_lock);
+ if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
return ERR_PTR(-ENOSPC);
- }
- sbinfo->free_blocks--;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
}
spin_unlock(&info->lock);
page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
- if (page)
- set_page_private(page, 0);
spin_lock(&info->lock);
if (!page) {
@@ -729,10 +726,11 @@ done2:
if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
/*
* Call truncate_inode_pages again: racing shmem_unuse_inode
- * may have swizzled a page in from swap since vmtruncate or
- * generic_delete_inode did it, before we lowered next_index.
- * Also, though shmem_getpage checks i_size before adding to
- * cache, no recheck after: so fix the narrow window there too.
+ * may have swizzled a page in from swap since
+ * truncate_pagecache or generic_delete_inode did it, before we
+ * lowered next_index. Also, though shmem_getpage checks
+ * i_size before adding to cache, no recheck after: so fix the
+ * narrow window there too.
*
* Recalling truncate_inode_pages_range and unmap_mapping_range
* every time for punch_hole (which never got a chance to clear
@@ -762,19 +760,21 @@ done2:
}
}
-static void shmem_truncate(struct inode *inode)
-{
- shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
-}
-
static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
- struct page *page = NULL;
+ loff_t newsize = attr->ia_size;
int error;
- if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- if (attr->ia_size < inode->i_size) {
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
+ && newsize != inode->i_size) {
+ struct page *page = NULL;
+
+ if (newsize < inode->i_size) {
/*
* If truncating down to a partial page, then
* if that page is already allocated, hold it
@@ -782,9 +782,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
* truncate_partial_page cannnot miss it were
* it assigned to swap.
*/
- if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
+ if (newsize & (PAGE_CACHE_SIZE-1)) {
(void) shmem_getpage(inode,
- attr->ia_size>>PAGE_CACHE_SHIFT,
+ newsize >> PAGE_CACHE_SHIFT,
&page, SGP_READ, NULL);
if (page)
unlock_page(page);
@@ -796,36 +796,38 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
* if it's being fully truncated to zero-length: the
* nrpages check is efficient enough in that case.
*/
- if (attr->ia_size) {
+ if (newsize) {
struct shmem_inode_info *info = SHMEM_I(inode);
spin_lock(&info->lock);
info->flags &= ~SHMEM_PAGEIN;
spin_unlock(&info->lock);
}
}
+
+ /* XXX(truncate): truncate_setsize should be called last */
+ truncate_setsize(inode, newsize);
+ if (page)
+ page_cache_release(page);
+ shmem_truncate_range(inode, newsize, (loff_t)-1);
}
- error = inode_change_ok(inode, attr);
- if (!error)
- error = inode_setattr(inode, attr);
+ setattr_copy(inode, attr);
#ifdef CONFIG_TMPFS_POSIX_ACL
- if (!error && (attr->ia_valid & ATTR_MODE))
+ if (attr->ia_valid & ATTR_MODE)
error = generic_acl_chmod(inode);
#endif
- if (page)
- page_cache_release(page);
return error;
}
-static void shmem_delete_inode(struct inode *inode)
+static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- if (inode->i_op->truncate == shmem_truncate) {
+ if (inode->i_mapping->a_ops == &shmem_aops) {
truncate_inode_pages(inode->i_mapping, 0);
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
- shmem_truncate(inode);
+ shmem_truncate_range(inode, 0, (loff_t)-1);
if (!list_empty(&info->swaplist)) {
mutex_lock(&shmem_swaplist_mutex);
list_del_init(&info->swaplist);
@@ -834,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode)
}
BUG_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
- clear_inode(inode);
+ end_writeback(inode);
}
static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
@@ -931,7 +933,7 @@ found:
/*
* Move _head_ to start search for next from here.
- * But be careful: shmem_delete_inode checks list_empty without taking
+ * But be careful: shmem_evict_inode checks list_empty without taking
* mutex, and there's an instant in list_move_tail when info->swaplist
* would appear empty, if it were the only one on shmem_swaplist. We
* could avoid doing it if inode NULL; or use this minor optimization.
@@ -1221,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
struct shmem_sb_info *sbinfo;
struct page *filepage = *pagep;
struct page *swappage;
+ struct page *prealloc_page = NULL;
swp_entry_t *entry;
swp_entry_t swap;
gfp_t gfp;
@@ -1245,7 +1248,6 @@ repeat:
filepage = find_lock_page(mapping, idx);
if (filepage && PageUptodate(filepage))
goto done;
- error = 0;
gfp = mapping_gfp_mask(mapping);
if (!filepage) {
/*
@@ -1256,7 +1258,19 @@ repeat:
if (error)
goto failed;
radix_tree_preload_end();
+ if (sgp != SGP_READ && !prealloc_page) {
+ /* We don't care if this fails */
+ prealloc_page = shmem_alloc_page(gfp, info, idx);
+ if (prealloc_page) {
+ if (mem_cgroup_cache_charge(prealloc_page,
+ current->mm, GFP_KERNEL)) {
+ page_cache_release(prealloc_page);
+ prealloc_page = NULL;
+ }
+ }
+ }
}
+ error = 0;
spin_lock(&info->lock);
shmem_recalc_inode(inode);
@@ -1385,17 +1399,16 @@ repeat:
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks == 0 ||
+ if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
shmem_acct_block(info->flags)) {
- spin_unlock(&sbinfo->stat_lock);
spin_unlock(&info->lock);
error = -ENOSPC;
goto failed;
}
- sbinfo->free_blocks--;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
} else if (shmem_acct_block(info->flags)) {
spin_unlock(&info->lock);
error = -ENOSPC;
@@ -1405,28 +1418,38 @@ repeat:
if (!filepage) {
int ret;
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(gfp, info, idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
- SetPageSwapBacked(filepage);
+ if (!prealloc_page) {
+ spin_unlock(&info->lock);
+ filepage = shmem_alloc_page(gfp, info, idx);
+ if (!filepage) {
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ error = -ENOMEM;
+ goto failed;
+ }
+ SetPageSwapBacked(filepage);
- /* Precharge page while we can wait, compensate after */
- error = mem_cgroup_cache_charge(filepage, current->mm,
- GFP_KERNEL);
- if (error) {
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- goto failed;
+ /*
+ * Precharge page while we can wait, compensate
+ * after
+ */
+ error = mem_cgroup_cache_charge(filepage,
+ current->mm, GFP_KERNEL);
+ if (error) {
+ page_cache_release(filepage);
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ filepage = NULL;
+ goto failed;
+ }
+
+ spin_lock(&info->lock);
+ } else {
+ filepage = prealloc_page;
+ prealloc_page = NULL;
+ SetPageSwapBacked(filepage);
}
- spin_lock(&info->lock);
entry = shmem_swp_alloc(info, idx, sgp);
if (IS_ERR(entry))
error = PTR_ERR(entry);
@@ -1467,13 +1490,19 @@ repeat:
}
done:
*pagep = filepage;
- return 0;
+ error = 0;
+ goto out;
failed:
if (*pagep != filepage) {
unlock_page(filepage);
page_cache_release(filepage);
}
+out:
+ if (prealloc_page) {
+ mem_cgroup_uncharge_cache_page(prealloc_page);
+ page_cache_release(prealloc_page);
+ }
return error;
}
@@ -1545,8 +1574,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static struct inode *shmem_get_inode(struct super_block *sb, int mode,
- dev_t dev, unsigned long flags)
+static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+ int mode, dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -1557,9 +1586,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
inode = new_inode(sb);
if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -1791,17 +1819,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ buf->f_bavail = buf->f_bfree =
+ sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
- spin_unlock(&sbinfo->stat_lock);
return 0;
}
@@ -1814,7 +1841,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE);
+ inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir, NULL, NULL,
NULL);
@@ -1833,11 +1860,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
#else
error = 0;
#endif
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
@@ -1882,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
inc_nlink(inode);
- atomic_inc(&inode->i_count); /* New dentry reference */
+ ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
out:
@@ -1957,7 +1979,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (len > PAGE_CACHE_SIZE)
return -ENAMETOOLONG;
- inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
+ inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
if (!inode)
return -ENOSPC;
@@ -1992,8 +2014,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
unlock_page(page);
page_cache_release(page);
}
- if (dir->i_mode & S_ISGID)
- inode->i_gid = dir->i_gid;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
@@ -2033,7 +2053,6 @@ static const struct inode_operations shmem_symlink_inline_operations = {
};
static const struct inode_operations shmem_symlink_inode_operations = {
- .truncate = shmem_truncate,
.readlink = generic_readlink,
.follow_link = shmem_follow_link,
.put_link = shmem_put_link,
@@ -2071,14 +2090,14 @@ static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
size, flags);
}
-static struct xattr_handler shmem_xattr_security_handler = {
+static const struct xattr_handler shmem_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = shmem_xattr_security_list,
.get = shmem_xattr_security_get,
.set = shmem_xattr_security_set,
};
-static struct xattr_handler *shmem_xattr_handlers[] = {
+static const struct xattr_handler *shmem_xattr_handlers[] = {
&generic_acl_access_handler,
&generic_acl_default_handler,
&shmem_xattr_security_handler,
@@ -2128,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
if (*len < 3)
return 255;
- if (hlist_unhashed(&inode->i_hash)) {
+ if (inode_unhashed(inode)) {
/* Unfortunately insert_inode_hash is not idempotent,
* so as we hash inodes here rather than at creation
* time, we need a lock to ensure we only try
@@ -2136,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
*/
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
__insert_inode_hash(inode,
inode->i_ino + inode->i_generation);
spin_unlock(&lock);
@@ -2250,7 +2269,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
struct shmem_sb_info config = *sbinfo;
- unsigned long blocks;
unsigned long inodes;
int error = -EINVAL;
@@ -2258,9 +2276,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
return error;
spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (config.max_blocks < blocks)
+ if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
goto out;
if (config.max_inodes < inodes)
goto out;
@@ -2277,7 +2294,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
error = 0;
sbinfo->max_blocks = config.max_blocks;
- sbinfo->free_blocks = config.max_blocks - blocks;
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
@@ -2310,7 +2326,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
static void shmem_put_super(struct super_block *sb)
{
- kfree(sb->s_fs_info);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ percpu_counter_destroy(&sbinfo->used_blocks);
+ kfree(sbinfo);
sb->s_fs_info = NULL;
}
@@ -2352,7 +2371,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#endif
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->free_blocks = sbinfo->max_blocks;
+ if (percpu_counter_init(&sbinfo->used_blocks, 0))
+ goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2366,7 +2386,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= MS_POSIXACL;
#endif
- inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (!inode)
goto failed;
inode->i_uid = sbinfo->uid;
@@ -2395,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
return &p->vfs_inode;
}
+static void shmem_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+}
+
static void shmem_destroy_inode(struct inode *inode)
{
if ((inode->i_mode & S_IFMT) == S_IFREG) {
/* only struct inode is valid if it's an inline symlink */
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
}
- kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+ call_rcu(&inode->i_rcu, shmem_i_callback);
}
static void init_once(void *foo)
@@ -2444,14 +2471,13 @@ static const struct file_operations shmem_file_operations = {
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
.aio_write = generic_file_aio_write,
- .fsync = simple_sync_file,
+ .fsync = noop_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
#endif
};
static const struct inode_operations shmem_inode_operations = {
- .truncate = shmem_truncate,
.setattr = shmem_notify_change,
.truncate_range = shmem_truncate_range,
#ifdef CONFIG_TMPFS_POSIX_ACL
@@ -2505,7 +2531,7 @@ static const struct super_operations shmem_ops = {
.remount_fs = shmem_remount_fs,
.show_options = shmem_show_options,
#endif
- .delete_inode = shmem_delete_inode,
+ .evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
};
@@ -2519,16 +2545,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
};
-static int shmem_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *shmem_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
- return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
+ return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
static struct file_system_type tmpfs_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
- .get_sb = shmem_get_sb,
+ .mount = shmem_mount,
.kill_sb = kill_litter_super,
};
@@ -2570,6 +2596,45 @@ out4:
return error;
}
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
+ * @inode: the inode to be searched
+ * @pgoff: the offset to be searched
+ * @pagep: the pointer for the found page to be stored
+ * @ent: the pointer for the found swap entry to be stored
+ *
+ * If a page is found, its refcount is incremented. Callers must handle
+ * this refcount.
+ */
+void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
+ struct page **pagep, swp_entry_t *ent)
+{
+ swp_entry_t entry = { .val = 0 }, *ptr;
+ struct page *page = NULL;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ goto out;
+
+ spin_lock(&info->lock);
+ ptr = shmem_swp_entry(info, pgoff, NULL);
+#ifdef CONFIG_SWAP
+ if (ptr && ptr->val) {
+ entry.val = ptr->val;
+ page = find_get_page(&swapper_space, entry.val);
+ } else
+#endif
+ page = find_get_page(inode->i_mapping, pgoff);
+ if (ptr)
+ shmem_swp_unmap(ptr);
+ spin_unlock(&info->lock);
+out:
+ *pagep = page;
+ *ent = entry;
+}
+#endif
+
#else /* !CONFIG_SHMEM */
/*
@@ -2585,7 +2650,7 @@ out4:
static struct file_system_type tmpfs_fs_type = {
.name = "tmpfs",
- .get_sb = ramfs_get_sb,
+ .mount = ramfs_mount,
.kill_sb = kill_litter_super,
};
@@ -2609,9 +2674,34 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
return 0;
}
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
+ * @inode: the inode to be searched
+ * @pgoff: the offset to be searched
+ * @pagep: the pointer for the found page to be stored
+ * @ent: the pointer for the found swap entry to be stored
+ *
+ * If a page is found, its refcount is incremented. Callers must handle
+ * this refcount.
+ */
+void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
+ struct page **pagep, swp_entry_t *ent)
+{
+ struct page *page = NULL;
+
+ if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ goto out;
+ page = find_get_page(inode->i_mapping, pgoff);
+out:
+ *pagep = page;
+ *ent = (swp_entry_t){ .val = 0 };
+}
+#endif
+
#define shmem_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
+#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
@@ -2655,7 +2745,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
path.mnt = mntget(shm_mnt);
error = -ENOSPC;
- inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
+ inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
if (!inode)
goto put_dentry;
diff --git a/mm/slab.c b/mm/slab.c
index 7451bdacaf18..37961d1f584f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,6 @@
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
-#include <linux/kmemtrace.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -115,6 +114,7 @@
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -144,30 +144,6 @@
#define BYTES_PER_WORD sizeof(void *)
#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
@@ -308,7 +284,7 @@ struct kmem_list3 {
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
-struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
#define SIZE_AC MAX_NUMNODES
#define SIZE_L3 (2 * MAX_NUMNODES)
@@ -418,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -844,7 +820,7 @@ static void init_reap_node(int cpu)
{
int node;
- node = next_node(cpu_to_node(cpu), node_online_map);
+ node = next_node(cpu_to_mem(cpu), node_online_map);
if (node == MAX_NUMNODES)
node = first_node(node_online_map);
@@ -853,12 +829,12 @@ static void init_reap_node(int cpu)
static void next_reap_node(void)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(slab_reap_node) = node;
+ __this_cpu_write(slab_reap_node, node);
}
#else
@@ -884,7 +860,7 @@ static void __cpuinit start_cpu_timer(int cpu)
*/
if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_DELAYED_WORK(reap_work, cache_reap);
+ INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
__round_jiffies_relative(HZ, cpu));
}
@@ -925,7 +901,7 @@ static int transfer_objects(struct array_cache *to,
struct array_cache *from, unsigned int max)
{
/* Figure out how many entries to transfer */
- int nr = min(min(from->avail, max), to->limit - to->avail);
+ int nr = min3(from->avail, max, to->limit - to->avail);
if (!nr)
return 0;
@@ -935,7 +911,6 @@ static int transfer_objects(struct array_cache *to,
from->avail -= nr;
to->avail += nr;
- to->touched = 1;
return nr;
}
@@ -983,13 +958,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, gfp, node);
+ ac_ptr = kzalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
- if (i == node || !node_online(i)) {
- ac_ptr[i] = NULL;
+ if (i == node || !node_online(i))
continue;
- }
ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
for (i--; i >= 0; i--)
@@ -1039,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
*/
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
@@ -1076,7 +1049,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
struct array_cache *alien = NULL;
int node;
- node = numa_node_id();
+ node = numa_mem_id();
/*
* Make sure we are not freeing a object from another node to the array
@@ -1105,11 +1078,57 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
#endif
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3;
+ const int memsize = sizeof(struct kmem_list3);
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ /*
+ * Set up the size64 kmemlist for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!l3)
+ return -ENOMEM;
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
+ cachep->nodelists[node] = l3;
+ }
+
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+ return 0;
+}
+
static void __cpuinit cpuup_canceled(long cpu)
{
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
+ int node = cpu_to_mem(cpu);
const struct cpumask *mask = cpumask_of_node(node);
list_for_each_entry(cachep, &cache_chain, next) {
@@ -1174,8 +1193,8 @@ static int __cpuinit cpuup_prepare(long cpu)
{
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
- const int memsize = sizeof(struct kmem_list3);
+ int node = cpu_to_mem(cpu);
+ int err;
/*
* We need to do this right in the beginning since
@@ -1183,35 +1202,9 @@ static int __cpuinit cpuup_prepare(long cpu)
* kmalloc_node allows us to add the slab to the right
* kmem_list3 and not this cpu's kmem_list3
*/
-
- list_for_each_entry(cachep, &cache_chain, next) {
- /*
- * Set up the size64 kmemlist for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
- */
- if (!cachep->nodelists[node]) {
- l3 = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!l3)
- goto bad;
- kmem_list3_init(l3);
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
- /*
- * The l3s don't come and go as CPUs come and
- * go. cache_chain_mutex is sufficient
- * protection here.
- */
- cachep->nodelists[node] = l3;
- }
-
- spin_lock_irq(&cachep->nodelists[node]->list_lock);
- cachep->nodelists[node]->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&cachep->nodelists[node]->list_lock);
- }
+ err = init_cache_nodelists_node(node);
+ if (err < 0)
+ goto bad;
/*
* Now we can go ahead with allocating the shared arrays and
@@ -1300,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
* anything expensive but will only modify reap_work
* and reschedule the timer.
*/
- cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
/* Now the cache_reaper is guaranteed to be not running. */
per_cpu(slab_reap_work, cpu).work.func = NULL;
break;
@@ -1327,18 +1320,82 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
mutex_unlock(&cache_chain_mutex);
break;
}
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ return notifier_from_errno(err);
}
static struct notifier_block __cpuinitdata cpucache_notifier = {
&cpuup_callback, NULL, 0
};
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ int ret = 0;
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct kmem_list3 *l3;
+
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ drain_freelist(cachep, l3, l3->free_objects);
+
+ if (!list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mnb = arg;
+ int ret = 0;
+ int nid;
+
+ nid = mnb->status_change_nid;
+ if (nid < 0)
+ goto out;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = init_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = drain_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+out:
+ return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
/*
* swap the static kmem_list3 with kmalloced memory
*/
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
- int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+ int nodeid)
{
struct kmem_list3 *ptr;
@@ -1421,7 +1478,7 @@ void __init kmem_cache_init(void)
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
- node = numa_node_id();
+ node = numa_mem_id();
/* 1) create the cache_cache */
INIT_LIST_HEAD(&cache_chain);
@@ -1583,6 +1640,14 @@ void __init kmem_cache_init_late(void)
*/
register_cpu_notifier(&cpucache_notifier);
+#ifdef CONFIG_NUMA
+ /*
+ * Register a memory hotplug callback that initializes and frees
+ * nodelists.
+ */
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
@@ -2055,7 +2120,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
}
}
- cachep->nodelists[numa_node_id()]->next_reap =
+ cachep->nodelists[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
@@ -2223,8 +2288,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (ralign < align) {
ralign = align;
}
- /* disable debug if necessary */
- if (ralign > __alignof__(unsigned long long))
+ /* disable debug if not aligning with REDZONE_ALIGN */
+ if (ralign & (__alignof__(unsigned long long) - 1))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
@@ -2250,8 +2315,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
- cachep->obj_offset += sizeof(unsigned long long);
- size += 2 * sizeof(unsigned long long);
+ cachep->obj_offset += align;
+ size += align + sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
@@ -2265,8 +2330,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - size;
+ && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
size = PAGE_SIZE;
}
#endif
@@ -2386,7 +2451,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+ assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
#endif
}
@@ -2413,7 +2478,7 @@ static void do_drain(void *arg)
{
struct kmem_cache *cachep = arg;
struct array_cache *ac;
- int node = numa_node_id();
+ int node = numa_mem_id();
check_irq_off();
ac = cpu_cache_get(cachep);
@@ -2716,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
/*
* Map pages beginning at addr to the given cache and slab. This is required
* for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ * virtual address for kfree, ksize, and slab debugging.
*/
static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
void *addr)
@@ -2946,7 +3011,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
retry:
check_irq_off();
- node = numa_node_id();
+ node = numa_mem_id();
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2963,8 +3028,10 @@ retry:
spin_lock(&l3->list_lock);
/* See if we can refill from the shared array */
- if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+ if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
+ l3->shared->touched = 1;
goto alloc_done;
+ }
while (batchcount > 0) {
struct list_head *entry;
@@ -3101,7 +3168,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
if (cachep == &cache_cache)
return false;
- return should_failslab(obj_size(cachep), flags);
+ return should_failslab(obj_size(cachep), flags, cachep->flags);
}
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
@@ -3148,11 +3215,13 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
- nid_alloc = nid_here = numa_node_id();
+ nid_alloc = nid_here = numa_mem_id();
+ get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
- nid_alloc = cpuset_mem_spread_node();
+ nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
+ put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3179,6 +3248,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
if (flags & __GFP_THISNODE)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
@@ -3210,7 +3280,7 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, numa_node_id());
+ obj = kmem_getpages(cache, local_flags, numa_mem_id());
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
@@ -3234,6 +3304,7 @@ retry:
}
}
}
+ put_mems_allowed();
return obj;
}
@@ -3317,6 +3388,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
{
unsigned long save_flags;
void *ptr;
+ int slab_node = numa_mem_id();
flags &= gfp_allowed_mask;
@@ -3329,7 +3401,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
local_irq_save(save_flags);
if (nodeid == -1)
- nodeid = numa_node_id();
+ nodeid = slab_node;
if (unlikely(!cachep->nodelists[nodeid])) {
/* Node not bootstrapped yet */
@@ -3337,7 +3409,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
goto out;
}
- if (nodeid == numa_node_id()) {
+ if (nodeid == slab_node) {
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
@@ -3381,8 +3453,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
* We may just have run out of memory on the local node.
* ____cache_alloc_node() knows how to locate memory on other nodes
*/
- if (!objp)
- objp = ____cache_alloc_node(cache, flags, numa_node_id());
+ if (!objp)
+ objp = ____cache_alloc_node(cache, flags, numa_mem_id());
out:
return objp;
@@ -3479,7 +3551,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
int batchcount;
struct kmem_list3 *l3;
- int node = numa_node_id();
+ int node = numa_mem_id();
batchcount = ac->batchcount;
#if DEBUG
@@ -3581,53 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
+void *
+kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
{
- return __cache_alloc(cachep, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
+ void *ret;
-/**
- * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
- * @cachep: the cache we're checking against
- * @ptr: pointer to validate
- *
- * This verifies that the untrusted pointer looks sane;
- * it is _not_ a guarantee that the pointer is actually
- * part of the slab cache in question, but it at least
- * validates that the pointer can be dereferenced and
- * looks half-way sane.
- *
- * Currently only used for dentry validation.
- */
-int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
-{
- unsigned long addr = (unsigned long)ptr;
- unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = BYTES_PER_WORD - 1;
- unsigned long size = cachep->buffer_size;
- struct page *page;
+ ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
- if (unlikely(addr < min_addr))
- goto out;
- if (unlikely(addr > (unsigned long)high_memory - size))
- goto out;
- if (unlikely(addr & align_mask))
- goto out;
- if (unlikely(!kern_addr_valid(addr)))
- goto out;
- if (unlikely(!kern_addr_valid(addr + size - 1)))
- goto out;
- page = virt_to_page(ptr);
- if (unlikely(!PageSlab(page)))
- goto out;
- if (unlikely(page_get_cache(page) != cachep))
- goto out;
- return 1;
-out:
- return 0;
+ trace_kmalloc(_RET_IP_, ret,
+ size, slab_buffer_size(cachep), flags);
+ return ret;
}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3644,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid)
+void *kmem_cache_alloc_node_trace(size_t size,
+ struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid)
{
- return __cache_alloc_node(cachep, flags, nodeid,
+ void *ret;
+
+ ret = __cache_alloc_node(cachep, flags, nodeid,
__builtin_return_address(0));
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, slab_buffer_size(cachep),
+ flags, nodeid);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
- void *ret;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
-
- trace_kmalloc_node((unsigned long) caller, ret,
- size, cachep->buffer_size, flags, node);
-
- return ret;
+ return kmem_cache_alloc_node_trace(size, cachep, flags, node);
}
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
@@ -3924,7 +3963,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
return -ENOMEM;
for_each_online_cpu(i) {
- new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
+ new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
@@ -3946,9 +3985,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
struct array_cache *ccold = new->new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
- free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
- spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
+ spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
kfree(ccold);
}
kfree(new);
@@ -4014,7 +4053,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
* necessary. Note that the l3 listlock also protects the array_cache
* if drain_array() is used on the shared array.
*/
-void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
struct array_cache *ac, int force, int node)
{
int tofree;
@@ -4054,7 +4093,7 @@ static void cache_reap(struct work_struct *w)
{
struct kmem_cache *searchp;
struct kmem_list3 *l3;
- int node = numa_node_id();
+ int node = numa_mem_id();
struct delayed_work *work = to_delayed_work(w);
if (!mutex_trylock(&cache_chain_mutex))
@@ -4228,10 +4267,11 @@ static int s_show(struct seq_file *m, void *p)
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
- reaped, errors, max_freeable, node_allocs,
- node_frees, overflows);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+ "%4lu %4lu %4lu %4lu %4lu",
+ allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees, overflows);
}
/* cpu stats */
{
@@ -4277,7 +4317,7 @@ static const struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
-ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
diff --git a/mm/slob.c b/mm/slob.c
index 837ebd64cc34..3588eaaef726 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -66,8 +66,10 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
-#include <linux/kmemtrace.h>
#include <linux/kmemleak.h>
+
+#include <trace/events/kmem.h>
+
#include <asm/atomic.h>
/*
@@ -394,6 +396,7 @@ static void slob_free(void *block, int size)
slob_t *prev, *next, *b = (slob_t *)block;
slobidx_t units;
unsigned long flags;
+ struct list_head *slob_list;
if (unlikely(ZERO_OR_NULL_PTR(block)))
return;
@@ -422,7 +425,13 @@ static void slob_free(void *block, int size)
set_slob(b, units,
(void *)((unsigned long)(b +
SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
- set_slob_page_free(sp, &free_slob_small);
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+ set_slob_page_free(sp, slob_list);
goto out;
}
@@ -467,14 +476,6 @@ out:
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
*/
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
-#endif
-
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
unsigned int *m;
@@ -499,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
} else {
unsigned int order = get_order(size);
- ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
+ if (likely(order))
+ gfp |= __GFP_COMP;
+ ret = slob_new_pages(gfp, order, node);
if (ret) {
struct page *page;
page = virt_to_page(ret);
@@ -647,7 +650,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
struct slob_rcu *slob_rcu;
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
- INIT_RCU_HEAD(&slob_rcu->head);
slob_rcu->size = c->size;
call_rcu(&slob_rcu->head, kmem_rcu_free);
} else {
@@ -676,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-int kmem_ptr_validate(struct kmem_cache *a, const void *b)
-{
- return 0;
-}
-
static unsigned int slob_ready __read_mostly;
int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf888d7..e15aa7f193c9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,6 @@
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/kmemtrace.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -29,6 +28,8 @@
#include <linux/math64.h>
#include <linux/fault-inject.h>
+#include <trace/events/kmem.h>
+
/*
* Lock order:
* 1. slab_lock(page)
@@ -107,11 +108,17 @@
* the fast path and disables lockless freelists.
*/
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DEBUG_FREE)
+
+static inline int kmem_cache_debug(struct kmem_cache *s)
+{
#ifdef CONFIG_SLUB_DEBUG
-#define SLABDEBUG 1
+ return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
-#define SLABDEBUG 0
+ return 0;
#endif
+}
/*
* Issues still to be resolved:
@@ -151,26 +158,18 @@
* Set of flags that will prevent slab merging
*/
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
+ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
+ SLAB_FAILSLAB)
#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
SLAB_CACHE_DMA | SLAB_NOTRACK)
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
#define OO_SHIFT 16
#define OO_MASK ((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
/* Internal SLUB flags */
-#define __OBJECT_POISON 0x80000000 /* Poison object */
-#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
+#define __OBJECT_POISON 0x80000000UL /* Poison object */
static int kmem_size = sizeof(struct kmem_cache);
@@ -180,7 +179,7 @@ static struct notifier_block slab_notifier;
static enum {
DOWN, /* No slab functionality available */
- PARTIAL, /* kmem_cache_open() works but kmalloc does not */
+ PARTIAL, /* Kmem_cache_node works */
UP, /* Everything works but does not show up in sysfs */
SYSFS /* Sysfs up */
} slab_state = DOWN;
@@ -201,7 +200,7 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
@@ -212,15 +211,16 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s)
{
+ kfree(s->name);
kfree(s);
}
#endif
-static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
+static inline void stat(struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
- c->stat[si]++;
+ __this_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
@@ -235,20 +235,7 @@ int slab_is_available(void)
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
-#ifdef CONFIG_NUMA
return s->node[node];
-#else
- return &s->local_node;
-#endif
-}
-
-static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
-{
-#ifdef CONFIG_SMP
- return s->cpu_slab[cpu];
-#else
- return &s->cpu_slab;
-#endif
}
/* Verify that a pointer has an address that is valid within a slab page */
@@ -269,13 +256,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
}
-/*
- * Slow version of get and set free pointer.
- *
- * This version requires touching the cache lines of kmem_cache which
- * we avoid to do in the fast alloc free paths. There we obtain the offset
- * from the page struct.
- */
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -512,7 +492,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
dump_stack();
}
-static void init_object(struct kmem_cache *s, void *object, int active)
+static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
@@ -522,9 +502,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
}
if (s->flags & SLAB_RED_ZONE)
- memset(p + s->objsize,
- active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
- s->inuse - s->objsize);
+ memset(p + s->objsize, val, s->inuse - s->objsize);
}
static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -659,17 +637,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
}
static int check_object(struct kmem_cache *s, struct page *page,
- void *object, int active)
+ void *object, u8 val)
{
u8 *p = object;
u8 *endobject = object + s->objsize;
if (s->flags & SLAB_RED_ZONE) {
- unsigned int red =
- active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
-
if (!check_bytes_and_report(s, page, object, "Redzone",
- endobject, red, s->inuse - s->objsize))
+ endobject, val, s->inuse - s->objsize))
return 0;
} else {
if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -679,7 +654,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
}
if (s->flags & SLAB_POISON) {
- if (!active && (s->flags & __OBJECT_POISON) &&
+ if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->objsize - 1) ||
!check_bytes_and_report(s, page, p, "Poison",
@@ -691,7 +666,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
check_pad_bytes(s, page, p);
}
- if (!s->offset && active)
+ if (!s->offset && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
@@ -810,6 +785,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
}
/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ */
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(flags & __GFP_WAIT);
+
+ return should_failslab(s->objsize, flags, s->flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
+{
+ flags &= gfp_allowed_mask;
+ kmemcheck_slab_alloc(s, flags, object, s->objsize);
+ kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
+}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+ kmemleak_free_recursive(x, s->flags);
+}
+
+static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
+{
+ kmemcheck_slab_free(s, object, s->objsize);
+ debug_check_no_locks_freed(object, s->objsize);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(object, s->objsize);
+}
+
+/*
* Tracking of fully allocated slabs for debugging purposes.
*/
static void add_full(struct kmem_cache_node *n, struct page *page)
@@ -856,7 +864,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
* dilemma by deferring the increment of the count during
* bootstrap (see early_kmem_cache_node_alloc).
*/
- if (!NUMA_BUILD || n) {
+ if (n) {
atomic_long_inc(&n->nr_slabs);
atomic_long_add(objects, &n->total_objects);
}
@@ -876,11 +884,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
return;
- init_object(s, object, 0);
+ init_object(s, object, SLUB_RED_INACTIVE);
init_tracking(s, object);
}
-static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
+static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
@@ -896,14 +904,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
goto bad;
}
- if (!check_object(s, page, object, 0))
+ if (!check_object(s, page, object, SLUB_RED_INACTIVE))
goto bad;
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_ALLOC, addr);
trace(s, page, object, 1);
- init_object(s, object, 1);
+ init_object(s, object, SLUB_RED_ACTIVE);
return 1;
bad:
@@ -920,8 +928,8 @@ bad:
return 0;
}
-static int free_debug_processing(struct kmem_cache *s, struct page *page,
- void *object, unsigned long addr)
+static noinline int free_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
{
if (!check_slab(s, page))
goto fail;
@@ -936,7 +944,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
goto fail;
}
- if (!check_object(s, page, object, 1))
+ if (!check_object(s, page, object, SLUB_RED_ACTIVE))
return 0;
if (unlikely(s != page->slab)) {
@@ -960,7 +968,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
- init_object(s, object, 0);
+ init_object(s, object, SLUB_RED_INACTIVE);
return 1;
fail:
@@ -1020,6 +1028,9 @@ static int __init setup_slub_debug(char *str)
case 't':
slub_debug |= SLAB_TRACE;
break;
+ case 'a':
+ slub_debug |= SLAB_FAILSLAB;
+ break;
default:
printk(KERN_ERR "slub_debug option '%c' "
"unknown. skipped\n", *str);
@@ -1061,7 +1072,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
- void *object, int active) { return 1; }
+ void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
@@ -1081,7 +1092,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-#endif
+
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+ { return 0; }
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ void *object) {}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
+
+static inline void slab_free_hook_irq(struct kmem_cache *s,
+ void *object) {}
+
+#endif /* CONFIG_SLUB_DEBUG */
/*
* Slab allocation and freeing
@@ -1093,10 +1116,10 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
flags |= __GFP_NOTRACK;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return alloc_pages(flags, order);
else
- return alloc_pages_node(node, flags, order);
+ return alloc_pages_exact_node(node, flags, order);
}
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1124,7 +1147,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
return NULL;
- stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
+ stat(s, ORDER_FALLBACK);
}
if (kmemcheck_enabled
@@ -1177,9 +1200,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
page->flags |= 1 << PG_slab;
- if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
- SLAB_STORE_USER | SLAB_TRACE))
- __SetPageSlubDebug(page);
start = page_address(page);
@@ -1206,14 +1226,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
+ if (kmem_cache_debug(s)) {
void *p;
slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
page->objects)
- check_object(s, page, p, 0);
- __ClearPageSlubDebug(page);
+ check_object(s, page, p, SLUB_RED_INACTIVE);
}
kmemcheck_free_shadow(page, compound_order(page));
@@ -1293,13 +1312,19 @@ static void add_partial(struct kmem_cache_node *n,
spin_unlock(&n->list_lock);
}
+static inline void __remove_partial(struct kmem_cache_node *n,
+ struct page *page)
+{
+ list_del(&page->lru);
+ n->nr_partial--;
+}
+
static void remove_partial(struct kmem_cache *s, struct page *page)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
spin_lock(&n->list_lock);
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
spin_unlock(&n->list_lock);
}
@@ -1312,8 +1337,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
struct page *page)
{
if (slab_trylock(page)) {
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
__SetPageSlubFrozen(page);
return 1;
}
@@ -1380,6 +1404,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -1389,10 +1414,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) {
page = get_partial_node(n);
- if (page)
+ if (page) {
+ put_mems_allowed();
return page;
+ }
}
}
+ put_mems_allowed();
#endif
return NULL;
}
@@ -1403,10 +1431,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- int searchnode = (node == -1) ? numa_node_id() : node;
+ int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
page = get_partial_node(get_node(s, searchnode));
- if (page || (flags & __GFP_THISNODE))
+ if (page || node != -1)
return page;
return get_any_partial(s, flags);
@@ -1420,25 +1448,24 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
* On exit the slab lock will have been dropped.
*/
static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
+ __releases(bitlock)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
__ClearPageSlubFrozen(page);
if (page->inuse) {
if (page->freelist) {
add_partial(n, page, tail);
- stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+ stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
} else {
- stat(c, DEACTIVATE_FULL);
- if (SLABDEBUG && PageSlubDebug(page) &&
- (s->flags & SLAB_STORE_USER))
+ stat(s, DEACTIVATE_FULL);
+ if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
add_full(n, page);
}
slab_unlock(page);
} else {
- stat(c, DEACTIVATE_EMPTY);
+ stat(s, DEACTIVATE_EMPTY);
if (n->nr_partial < s->min_partial) {
/*
* Adding an empty slab to the partial slabs in order
@@ -1454,7 +1481,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
slab_unlock(page);
} else {
slab_unlock(page);
- stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
+ stat(s, FREE_SLAB);
discard_slab(s, page);
}
}
@@ -1464,12 +1491,13 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
* Remove the cpu slab
*/
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+ __releases(bitlock)
{
struct page *page = c->page;
int tail = 1;
if (page->freelist)
- stat(c, DEACTIVATE_REMOTE_FREES);
+ stat(s, DEACTIVATE_REMOTE_FREES);
/*
* Merge cpu freelist into slab freelist. Typically we get here
* because both freelists are empty. So this is unlikely
@@ -1482,10 +1510,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
/* Retrieve object from cpu_freelist */
object = c->freelist;
- c->freelist = c->freelist[c->offset];
+ c->freelist = get_freepointer(s, c->freelist);
/* And put onto the regular freelist */
- object[c->offset] = page->freelist;
+ set_freepointer(s, object, page->freelist);
page->freelist = object;
page->inuse--;
}
@@ -1495,7 +1523,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
- stat(c, CPUSLAB_FLUSH);
+ stat(s, CPUSLAB_FLUSH);
slab_lock(c->page);
deactivate_slab(s, c);
}
@@ -1507,7 +1535,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
*/
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
if (likely(c && c->page))
flush_slab(s, c);
@@ -1532,7 +1560,7 @@ static void flush_all(struct kmem_cache *s)
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
- if (node != -1 && c->node != node)
+ if (node != NUMA_NO_NODE && c->node != node)
return 0;
#endif
return 1;
@@ -1635,22 +1663,22 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
if (unlikely(!node_match(c, node)))
goto another_slab;
- stat(c, ALLOC_REFILL);
+ stat(s, ALLOC_REFILL);
load_freelist:
object = c->page->freelist;
if (unlikely(!object))
goto another_slab;
- if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
+ if (kmem_cache_debug(s))
goto debug;
- c->freelist = object[c->offset];
+ c->freelist = get_freepointer(s, object);
c->page->inuse = c->page->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
unlock_out:
slab_unlock(c->page);
- stat(c, ALLOC_SLOWPATH);
+ stat(s, ALLOC_SLOWPATH);
return object;
another_slab:
@@ -1660,10 +1688,11 @@ new_slab:
new = get_partial(s, gfpflags, node);
if (new) {
c->page = new;
- stat(c, ALLOC_FROM_PARTIAL);
+ stat(s, ALLOC_FROM_PARTIAL);
goto load_freelist;
}
+ gfpflags &= gfp_allowed_mask;
if (gfpflags & __GFP_WAIT)
local_irq_enable();
@@ -1673,8 +1702,8 @@ new_slab:
local_irq_disable();
if (new) {
- c = get_cpu_slab(s, smp_processor_id());
- stat(c, ALLOC_SLAB);
+ c = __this_cpu_ptr(s->cpu_slab);
+ stat(s, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
slab_lock(new);
@@ -1690,8 +1719,8 @@ debug:
goto another_slab;
c->page->inuse++;
- c->page->freelist = object[c->offset];
- c->node = -1;
+ c->page->freelist = get_freepointer(s, object);
+ c->node = NUMA_NO_NODE;
goto unlock_out;
}
@@ -1711,42 +1740,34 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
void **object;
struct kmem_cache_cpu *c;
unsigned long flags;
- unsigned int objsize;
- gfpflags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(gfpflags);
- might_sleep_if(gfpflags & __GFP_WAIT);
-
- if (should_failslab(s->objsize, gfpflags))
+ if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
local_irq_save(flags);
- c = get_cpu_slab(s, smp_processor_id());
- objsize = c->objsize;
- if (unlikely(!c->freelist || !node_match(c, node)))
+ c = __this_cpu_ptr(s->cpu_slab);
+ object = c->freelist;
+ if (unlikely(!object || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
- object = c->freelist;
- c->freelist = object[c->offset];
- stat(c, ALLOC_FASTPATH);
+ c->freelist = get_freepointer(s, object);
+ stat(s, ALLOC_FASTPATH);
}
local_irq_restore(flags);
if (unlikely(gfpflags & __GFP_ZERO) && object)
- memset(object, 0, objsize);
+ memset(object, 0, s->objsize);
- kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
- kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
+ slab_post_alloc_hook(s, gfpflags, object);
return object;
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
@@ -1755,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
+void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+ void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
- return slab_alloc(s, gfpflags, -1, _RET_IP_);
+ void *ret = kmalloc_order(size, flags, order);
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
+EXPORT_SYMBOL(kmalloc_order_trace);
#endif
#ifdef CONFIG_NUMA
@@ -1773,16 +1804,20 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node)
+ int node, size_t size)
{
- return slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, s->size, gfpflags, node);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
+#endif
#endif
/*
@@ -1794,26 +1829,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
* handling required then we can return immediately.
*/
static void __slab_free(struct kmem_cache *s, struct page *page,
- void *x, unsigned long addr, unsigned int offset)
+ void *x, unsigned long addr)
{
void *prior;
void **object = (void *)x;
- struct kmem_cache_cpu *c;
- c = get_cpu_slab(s, raw_smp_processor_id());
- stat(c, FREE_SLOWPATH);
+ stat(s, FREE_SLOWPATH);
slab_lock(page);
- if (unlikely(SLABDEBUG && PageSlubDebug(page)))
+ if (kmem_cache_debug(s))
goto debug;
checks_ok:
- prior = object[offset] = page->freelist;
+ prior = page->freelist;
+ set_freepointer(s, object, prior);
page->freelist = object;
page->inuse--;
if (unlikely(PageSlubFrozen(page))) {
- stat(c, FREE_FROZEN);
+ stat(s, FREE_FROZEN);
goto out_unlock;
}
@@ -1826,7 +1860,7 @@ checks_ok:
*/
if (unlikely(!prior)) {
add_partial(get_node(s, page_to_nid(page)), page, 1);
- stat(c, FREE_ADD_PARTIAL);
+ stat(s, FREE_ADD_PARTIAL);
}
out_unlock:
@@ -1839,10 +1873,10 @@ slab_empty:
* Slab still on the partial list.
*/
remove_partial(s, page);
- stat(c, FREE_REMOVE_PARTIAL);
+ stat(s, FREE_REMOVE_PARTIAL);
}
slab_unlock(page);
- stat(c, FREE_SLAB);
+ stat(s, FREE_SLAB);
discard_slab(s, page);
return;
@@ -1870,19 +1904,19 @@ static __always_inline void slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long flags;
- kmemleak_free_recursive(x, s->flags);
+ slab_free_hook(s, x);
+
local_irq_save(flags);
- c = get_cpu_slab(s, smp_processor_id());
- kmemcheck_slab_free(s, object, c->objsize);
- debug_check_no_locks_freed(object, c->objsize);
- if (!(s->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(object, c->objsize);
- if (likely(page == c->page && c->node >= 0)) {
- object[c->offset] = c->freelist;
+ c = __this_cpu_ptr(s->cpu_slab);
+
+ slab_free_hook_irq(s, x);
+
+ if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
+ set_freepointer(s, object, c->freelist);
c->freelist = object;
- stat(c, FREE_FASTPATH);
+ stat(s, FREE_FASTPATH);
} else
- __slab_free(s, page, x, addr, c->offset);
+ __slab_free(s, page, x, addr);
local_irq_restore(flags);
}
@@ -1899,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Figure out on which slab page the object resides */
-static struct page *get_object_page(const void *x)
-{
- struct page *page = virt_to_head_page(x);
-
- if (!PageSlab(page))
- return NULL;
-
- return page;
-}
-
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -2069,19 +2092,6 @@ static unsigned long calculate_alignment(unsigned long flags,
return ALIGN(align, sizeof(void *));
}
-static void init_kmem_cache_cpu(struct kmem_cache *s,
- struct kmem_cache_cpu *c)
-{
- c->page = NULL;
- c->freelist = NULL;
- c->node = 0;
- c->offset = s->offset / sizeof(void *);
- c->objsize = s->objsize;
-#ifdef CONFIG_SLUB_STATS
- memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
-#endif
-}
-
static void
init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
{
@@ -2095,132 +2105,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
#endif
}
-#ifdef CONFIG_SMP
-/*
- * Per cpu array for per cpu structures.
- *
- * The per cpu array places all kmem_cache_cpu structures from one processor
- * close together meaning that it becomes possible that multiple per cpu
- * structures are contained in one cacheline. This may be particularly
- * beneficial for the kmalloc caches.
- *
- * A desktop system typically has around 60-80 slabs. With 100 here we are
- * likely able to get per cpu structures for all caches from the array defined
- * here. We must be able to cover all kmalloc caches during bootstrap.
- *
- * If the per cpu array is exhausted then fall back to kmalloc
- * of individual cachelines. No sharing is possible then.
- */
-#define NR_KMEM_CACHE_CPU 100
-
-static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
- kmem_cache_cpu);
-
-static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
-static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
-
-static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
- int cpu, gfp_t flags)
-{
- struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
-
- if (c)
- per_cpu(kmem_cache_cpu_free, cpu) =
- (void *)c->freelist;
- else {
- /* Table overflow: So allocate ourselves */
- c = kmalloc_node(
- ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
- flags, cpu_to_node(cpu));
- if (!c)
- return NULL;
- }
-
- init_kmem_cache_cpu(s, c);
- return c;
-}
-
-static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
-{
- if (c < per_cpu(kmem_cache_cpu, cpu) ||
- c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
- kfree(c);
- return;
- }
- c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
- per_cpu(kmem_cache_cpu_free, cpu) = c;
-}
-
-static void free_kmem_cache_cpus(struct kmem_cache *s)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
- if (c) {
- s->cpu_slab[cpu] = NULL;
- free_kmem_cache_cpu(c, cpu);
- }
- }
-}
-
-static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
- int cpu;
+ BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
+ SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
- for_each_online_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
- if (c)
- continue;
-
- c = alloc_kmem_cache_cpu(s, cpu, flags);
- if (!c) {
- free_kmem_cache_cpus(s);
- return 0;
- }
- s->cpu_slab[cpu] = c;
- }
- return 1;
+ return s->cpu_slab != NULL;
}
-/*
- * Initialize the per cpu array.
- */
-static void init_alloc_cpu_cpu(int cpu)
-{
- int i;
-
- if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
- return;
-
- for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
- free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
+static struct kmem_cache *kmem_cache_node;
- cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
-}
-
-static void __init init_alloc_cpu(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- init_alloc_cpu_cpu(cpu);
- }
-
-#else
-static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
-static inline void init_alloc_cpu(void) {}
-
-static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
-{
- init_kmem_cache_cpu(s, &s->cpu_slab);
- return 1;
-}
-#endif
-
-#ifdef CONFIG_NUMA
/*
* No kmalloc_node yet so do it by hand. We know that this is the first
* slab on the node for this slabcache. There are no concurrent accesses
@@ -2230,15 +2126,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
* when allocating for the kmalloc_node_cache. This is used for bootstrapping
* memory on a fresh node that has no slab structures yet.
*/
-static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
+static void early_kmem_cache_node_alloc(int node)
{
struct page *page;
struct kmem_cache_node *n;
unsigned long flags;
- BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
+ BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
- page = new_slab(kmalloc_caches, gfpflags, node);
+ page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!page);
if (page_to_nid(page) != node) {
@@ -2250,15 +2146,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
n = page->freelist;
BUG_ON(!n);
- page->freelist = get_freepointer(kmalloc_caches, n);
+ page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse++;
- kmalloc_caches->node[node] = n;
+ kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
- init_object(kmalloc_caches, n, 1);
- init_tracking(kmalloc_caches, n);
+ init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
+ init_tracking(kmem_cache_node, n);
#endif
- init_kmem_cache_node(n, kmalloc_caches);
- inc_slabs_node(kmalloc_caches, node, page->objects);
+ init_kmem_cache_node(n, kmem_cache_node);
+ inc_slabs_node(kmem_cache_node, node, page->objects);
/*
* lockdep requires consistent irq usage for each lock
@@ -2276,57 +2172,38 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = s->node[node];
- if (n && n != &s->local_node)
- kmem_cache_free(kmalloc_caches, n);
+
+ if (n)
+ kmem_cache_free(kmem_cache_node, n);
+
s->node[node] = NULL;
}
}
-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
+static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
- int local_node;
-
- if (slab_state >= UP)
- local_node = page_to_nid(virt_to_page(s));
- else
- local_node = 0;
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n;
- if (local_node == node)
- n = &s->local_node;
- else {
- if (slab_state == DOWN) {
- early_kmem_cache_node_alloc(gfpflags, node);
- continue;
- }
- n = kmem_cache_alloc_node(kmalloc_caches,
- gfpflags, node);
-
- if (!n) {
- free_kmem_cache_nodes(s);
- return 0;
- }
+ if (slab_state == DOWN) {
+ early_kmem_cache_node_alloc(node);
+ continue;
+ }
+ n = kmem_cache_alloc_node(kmem_cache_node,
+ GFP_KERNEL, node);
+ if (!n) {
+ free_kmem_cache_nodes(s);
+ return 0;
}
+
s->node[node] = n;
init_kmem_cache_node(n, s);
}
return 1;
}
-#else
-static void free_kmem_cache_nodes(struct kmem_cache *s)
-{
-}
-
-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
-{
- init_kmem_cache_node(&s->local_node, s);
- return 1;
-}
-#endif
static void set_min_partial(struct kmem_cache *s, unsigned long min)
{
@@ -2461,7 +2338,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
}
-static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
+static int kmem_cache_open(struct kmem_cache *s,
const char *name, size_t size,
size_t align, unsigned long flags,
void (*ctor)(void *))
@@ -2497,11 +2374,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
- if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+ if (!init_kmem_cache_nodes(s))
goto error;
- if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
+ if (alloc_kmem_cache_cpus(s))
return 1;
+
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
@@ -2513,32 +2391,6 @@ error:
}
/*
- * Check if a given pointer is valid
- */
-int kmem_ptr_validate(struct kmem_cache *s, const void *object)
-{
- struct page *page;
-
- page = get_object_page(object);
-
- if (!page || s != page->slab)
- /* No slab or wrong slab */
- return 0;
-
- if (!check_valid_pointer(s, page, object))
- return 0;
-
- /*
- * We could also check if the object is on the slabs freelist.
- * But this would be too expensive and it seems that the main
- * purpose of kmem_ptr_valid() is to check if the object belongs
- * to a certain slab.
- */
- return 1;
-}
-EXPORT_SYMBOL(kmem_ptr_validate);
-
-/*
* Determine the size of a slab object
*/
unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -2559,9 +2411,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- DECLARE_BITMAP(map, page->objects);
-
- bitmap_zero(map, page->objects);
+ unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
+ sizeof(long), GFP_ATOMIC);
+ if (!map)
+ return;
slab_err(s, page, "%s", text);
slab_lock(page);
for_each_free_object(p, s, page->freelist)
@@ -2576,6 +2429,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
slab_unlock(page);
+ kfree(map);
#endif
}
@@ -2590,9 +2444,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- list_del(&page->lru);
+ __remove_partial(n, page);
discard_slab(s, page);
- n->nr_partial--;
} else {
list_slab_objects(s, page,
"Objects remaining on kmem_cache_close()");
@@ -2609,9 +2462,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
int node;
flush_all(s);
-
+ free_percpu(s->cpu_slab);
/* Attempt to free all objects */
- free_kmem_cache_cpus(s);
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -2633,7 +2485,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
- up_write(&slub_lock);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
"still has objects.\n", s->name, __func__);
@@ -2642,8 +2493,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
sysfs_slab_remove(s);
- } else
- up_write(&slub_lock);
+ }
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2651,9 +2502,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
* Kmalloc subsystem
*******************************************************************/
-struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
+struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
EXPORT_SYMBOL(kmalloc_caches);
+static struct kmem_cache *kmem_cache;
+
+#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
+#endif
+
static int __init setup_slub_min_order(char *str)
{
get_option(&str, &slub_min_order);
@@ -2690,108 +2547,29 @@ static int __init setup_slub_nomerge(char *str)
__setup("slub_nomerge", setup_slub_nomerge);
-static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
- const char *name, int size, gfp_t gfp_flags)
+static struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ int size, unsigned int flags)
{
- unsigned int flags = 0;
+ struct kmem_cache *s;
- if (gfp_flags & SLUB_DMA)
- flags = SLAB_CACHE_DMA;
+ s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
/*
* This function is called with IRQs disabled during early-boot on
* single CPU so there's no need to take slub_lock here.
*/
- if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
+ if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
flags, NULL))
goto panic;
list_add(&s->list, &slab_caches);
-
- if (sysfs_slab_add(s))
- goto panic;
return s;
panic:
panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
+ return NULL;
}
-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
-
-static void sysfs_add_func(struct work_struct *w)
-{
- struct kmem_cache *s;
-
- down_write(&slub_lock);
- list_for_each_entry(s, &slab_caches, list) {
- if (s->flags & __SYSFS_ADD_DEFERRED) {
- s->flags &= ~__SYSFS_ADD_DEFERRED;
- sysfs_slab_add(s);
- }
- }
- up_write(&slub_lock);
-}
-
-static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
-
-static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
-{
- struct kmem_cache *s;
- char *text;
- size_t realsize;
- unsigned long slabflags;
-
- s = kmalloc_caches_dma[index];
- if (s)
- return s;
-
- /* Dynamically create dma cache */
- if (flags & __GFP_WAIT)
- down_write(&slub_lock);
- else {
- if (!down_write_trylock(&slub_lock))
- goto out;
- }
-
- if (kmalloc_caches_dma[index])
- goto unlock_out;
-
- realsize = kmalloc_caches[index].objsize;
- text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
- (unsigned int)realsize);
- s = kmalloc(kmem_size, flags & ~SLUB_DMA);
-
- /*
- * Must defer sysfs creation to a workqueue because we don't know
- * what context we are called from. Before sysfs comes up, we don't
- * need to do anything because our sysfs initcall will start by
- * adding all existing slabs to sysfs.
- */
- slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
- if (slab_state >= SYSFS)
- slabflags |= __SYSFS_ADD_DEFERRED;
-
- if (!s || !text || !kmem_cache_open(s, flags, text,
- realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
- kfree(s);
- kfree(text);
- goto unlock_out;
- }
-
- list_add(&s->list, &slab_caches);
- kmalloc_caches_dma[index] = s;
-
- if (slab_state >= SYSFS)
- schedule_work(&sysfs_add_work);
-
-unlock_out:
- up_write(&slub_lock);
-out:
- return kmalloc_caches_dma[index];
-}
-#endif
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2844,10 +2622,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
#ifdef CONFIG_ZONE_DMA
if (unlikely((flags & SLUB_DMA)))
- return dma_kmalloc_cache(index, flags);
+ return kmalloc_dma_caches[index];
#endif
- return &kmalloc_caches[index];
+ return kmalloc_caches[index];
}
void *__kmalloc(size_t size, gfp_t flags)
@@ -2863,7 +2641,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, -1, _RET_IP_);
+ ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -2871,6 +2649,7 @@ void *__kmalloc(size_t size, gfp_t flags)
}
EXPORT_SYMBOL(__kmalloc);
+#ifdef CONFIG_NUMA
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
@@ -2885,7 +2664,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
return ptr;
}
-#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s;
@@ -3025,8 +2803,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
* may have freed the last object and be
* waiting to release the slab.
*/
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
slab_unlock(page);
discard_slab(s, page);
} else {
@@ -3050,7 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+#if defined(CONFIG_MEMORY_HOTPLUG)
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -3086,13 +2863,13 @@ static void slab_mem_offline_callback(void *arg)
/*
* if n->nr_slabs > 0, slabs still exist on the node
* that is going down. We were unable to free them,
- * and offline_pages() function shoudn't call this
+ * and offline_pages() function shouldn't call this
* callback. So, we must fail.
*/
BUG_ON(slabs_node(s, offline_node));
s->node[offline_node] = NULL;
- kmem_cache_free(kmalloc_caches, n);
+ kmem_cache_free(kmem_cache_node, n);
}
}
up_read(&slub_lock);
@@ -3125,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg)
* since memory is not yet available from the node that
* is brought up.
*/
- n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
+ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto out;
@@ -3171,48 +2948,92 @@ static int slab_memory_callback(struct notifier_block *self,
* Basic setup of slabs
*******************************************************************/
+/*
+ * Used for early kmem_cache structures that were allocated using
+ * the page allocator
+ */
+
+static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
+{
+ int node;
+
+ list_add(&s->list, &slab_caches);
+ s->refcount = -1;
+
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+ struct page *p;
+
+ if (n) {
+ list_for_each_entry(p, &n->partial, lru)
+ p->slab = s;
+
+#ifdef CONFIG_SLAB_DEBUG
+ list_for_each_entry(p, &n->full, lru)
+ p->slab = s;
+#endif
+ }
+ }
+}
+
void __init kmem_cache_init(void)
{
int i;
int caches = 0;
+ struct kmem_cache *temp_kmem_cache;
+ int order;
+ struct kmem_cache *temp_kmem_cache_node;
+ unsigned long kmalloc_size;
- init_alloc_cpu();
+ kmem_size = offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *);
+
+ /* Allocate two kmem_caches from the page allocator */
+ kmalloc_size = ALIGN(kmem_size, cache_line_size());
+ order = get_order(2 * kmalloc_size);
+ kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
-#ifdef CONFIG_NUMA
/*
* Must first have the slab cache available for the allocations of the
* struct kmem_cache_node's. There is special bootstrap code in
* kmem_cache_open for slab_state == DOWN.
*/
- create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
- sizeof(struct kmem_cache_node), GFP_NOWAIT);
- kmalloc_caches[0].refcount = -1;
- caches++;
+ kmem_cache_node = (void *)kmem_cache + kmalloc_size;
+
+ kmem_cache_open(kmem_cache_node, "kmem_cache_node",
+ sizeof(struct kmem_cache_node),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
-#endif
/* Able to allocate the per node structures */
slab_state = PARTIAL;
- /* Caches that are not of the two-to-the-power-of size */
- if (KMALLOC_MIN_SIZE <= 32) {
- create_kmalloc_cache(&kmalloc_caches[1],
- "kmalloc-96", 96, GFP_NOWAIT);
- caches++;
- }
- if (KMALLOC_MIN_SIZE <= 64) {
- create_kmalloc_cache(&kmalloc_caches[2],
- "kmalloc-192", 192, GFP_NOWAIT);
- caches++;
- }
+ temp_kmem_cache = kmem_cache;
+ kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+ memcpy(kmem_cache, temp_kmem_cache, kmem_size);
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
- create_kmalloc_cache(&kmalloc_caches[i],
- "kmalloc", 1 << i, GFP_NOWAIT);
- caches++;
- }
+ /*
+ * Allocate kmem_cache_node properly from the kmem_cache slab.
+ * kmem_cache_node is separately allocated so no need to
+ * update any list pointers.
+ */
+ temp_kmem_cache_node = kmem_cache_node;
+
+ kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+ memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
+
+ kmem_cache_bootstrap_fixup(kmem_cache_node);
+ caches++;
+ kmem_cache_bootstrap_fixup(kmem_cache);
+ caches++;
+ /* Free temporary boot structure */
+ free_pages((unsigned long)temp_kmem_cache, order);
+
+ /* Now we can use the kmem_cache to allocate kmalloc slabs */
/*
* Patch up the size_index table if we have strange large alignment
@@ -3252,21 +3073,60 @@ void __init kmem_cache_init(void)
size_index[size_index_elem(i)] = 8;
}
+ /* Caches that are not of the two-to-the-power-of size */
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
+ caches++;
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
+ caches++;
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
+ caches++;
+ }
+
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
- kmalloc_caches[i]. name =
- kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[1]->name);
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[2]->name);
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+
+ BUG_ON(!s);
+ kmalloc_caches[i]->name = s;
+ }
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
- kmem_size = offsetof(struct kmem_cache, cpu_slab) +
- nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
-#else
- kmem_size = sizeof(struct kmem_cache);
#endif
+#ifdef CONFIG_ZONE_DMA
+ for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
+ struct kmem_cache *s = kmalloc_caches[i];
+
+ if (s && s->size) {
+ char *name = kasprintf(GFP_NOWAIT,
+ "dma-kmalloc-%d", s->objsize);
+
+ BUG_ON(!name);
+ kmalloc_dma_caches[i] = create_kmalloc_cache(name,
+ s->objsize, SLAB_CACHE_DMA);
+ }
+ }
+#endif
printk(KERN_INFO
"SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
" CPUs=%d, Nodes=%d\n",
@@ -3344,6 +3204,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
+ char *n;
if (WARN_ON(!name))
return NULL;
@@ -3351,54 +3212,46 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
down_write(&slub_lock);
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
- int cpu;
-
s->refcount++;
/*
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
*/
s->objsize = max(s->objsize, (int)size);
-
- /*
- * And then we need to update the object size in the
- * per cpu structures
- */
- for_each_online_cpu(cpu)
- get_cpu_slab(s, cpu)->objsize = s->objsize;
-
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- up_write(&slub_lock);
if (sysfs_slab_alias(s, name)) {
- down_write(&slub_lock);
s->refcount--;
- up_write(&slub_lock);
goto err;
}
+ up_write(&slub_lock);
return s;
}
+ n = kstrdup(name, GFP_KERNEL);
+ if (!n)
+ goto err;
+
s = kmalloc(kmem_size, GFP_KERNEL);
if (s) {
- if (kmem_cache_open(s, GFP_KERNEL, name,
+ if (kmem_cache_open(s, n,
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
- up_write(&slub_lock);
if (sysfs_slab_add(s)) {
- down_write(&slub_lock);
list_del(&s->list);
- up_write(&slub_lock);
+ kfree(n);
kfree(s);
goto err;
}
+ up_write(&slub_lock);
return s;
}
+ kfree(n);
kfree(s);
}
+err:
up_write(&slub_lock);
-err:
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else
@@ -3420,29 +3273,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
unsigned long flags;
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- init_alloc_cpu_cpu(cpu);
- down_read(&slub_lock);
- list_for_each_entry(s, &slab_caches, list)
- s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
- GFP_KERNEL);
- up_read(&slub_lock);
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
down_read(&slub_lock);
list_for_each_entry(s, &slab_caches, list) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
local_irq_save(flags);
__flush_cpu_slab(s, cpu);
local_irq_restore(flags);
- free_kmem_cache_cpu(c, cpu);
- s->cpu_slab[cpu] = NULL;
}
up_read(&slub_lock);
break;
@@ -3471,7 +3310,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, -1, caller);
+ ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
/* Honor the call site pointer we recieved. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -3479,14 +3318,22 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
return ret;
}
+#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
int node, unsigned long caller)
{
struct kmem_cache *s;
void *ret;
- if (unlikely(size > SLUB_MAX_SIZE))
- return kmalloc_large_node(size, gfpflags, node);
+ if (unlikely(size > SLUB_MAX_SIZE)) {
+ ret = kmalloc_large_node(size, gfpflags, node);
+
+ trace_kmalloc_node(caller, ret,
+ size, PAGE_SIZE << get_order(size),
+ gfpflags, node);
+
+ return ret;
+ }
s = get_slab(size, gfpflags);
@@ -3500,8 +3347,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return ret;
}
+#endif
-#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SYSFS
static int count_inuse(struct page *page)
{
return page->inuse;
@@ -3511,7 +3359,9 @@ static int count_total(struct page *page)
{
return page->objects;
}
+#endif
+#ifdef CONFIG_SLUB_DEBUG
static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
@@ -3527,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
for_each_free_object(p, s, page->freelist) {
set_bit(slab_index(p, s, addr), map);
- if (!check_object(s, page, p, 0))
+ if (!check_object(s, page, p, SLUB_RED_INACTIVE))
return 0;
}
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, 1))
+ if (!check_object(s, page, p, SLUB_RED_ACTIVE))
return 0;
return 1;
}
@@ -3547,16 +3397,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
} else
printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
s->name, page);
-
- if (s->flags & DEBUG_DEFAULT_FLAGS) {
- if (!PageSlubDebug(page))
- printk(KERN_ERR "SLUB %s: SlubDebug not set "
- "on slab 0x%p\n", s->name, page);
- } else {
- if (PageSlubDebug(page))
- printk(KERN_ERR "SLUB %s: SlubDebug set on "
- "slab 0x%p\n", s->name, page);
- }
}
static int validate_slab_node(struct kmem_cache *s,
@@ -3612,65 +3452,6 @@ static long validate_slab_cache(struct kmem_cache *s)
kfree(map);
return count;
}
-
-#ifdef SLUB_RESILIENCY_TEST
-static void resiliency_test(void)
-{
- u8 *p;
-
- printk(KERN_ERR "SLUB resiliency testing\n");
- printk(KERN_ERR "-----------------------\n");
- printk(KERN_ERR "A. Corruption after allocation\n");
-
- p = kzalloc(16, GFP_KERNEL);
- p[16] = 0x12;
- printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
- " 0x12->0x%p\n\n", p + 16);
-
- validate_slab_cache(kmalloc_caches + 4);
-
- /* Hmmm... The next two are dangerous */
- p = kzalloc(32, GFP_KERNEL);
- p[32 + sizeof(void *)] = 0x34;
- printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
- " 0x34 -> -0x%p\n", p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
-
- validate_slab_cache(kmalloc_caches + 5);
- p = kzalloc(64, GFP_KERNEL);
- p += 64 + (get_cycles() & 0xff) * sizeof(void *);
- *p = 0x56;
- printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches + 6);
-
- printk(KERN_ERR "\nB. Corruption after free\n");
- p = kzalloc(128, GFP_KERNEL);
- kfree(p);
- *p = 0x78;
- printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches + 7);
-
- p = kzalloc(256, GFP_KERNEL);
- kfree(p);
- p[50] = 0x9a;
- printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
- p);
- validate_slab_cache(kmalloc_caches + 8);
-
- p = kzalloc(512, GFP_KERNEL);
- kfree(p);
- p[512] = 0xab;
- printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches + 9);
-}
-#else
-static void resiliency_test(void) {};
-#endif
-
/*
* Generate lists of code addresses where slabcache objects are allocated
* and freed.
@@ -3798,10 +3579,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc)
+ struct page *page, enum track_item alloc,
+ unsigned long *map)
{
void *addr = page_address(page);
- DECLARE_BITMAP(map, page->objects);
void *p;
bitmap_zero(map, page->objects);
@@ -3820,11 +3601,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
+ unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+ sizeof(unsigned long), GFP_KERNEL);
- if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_TEMPORARY))
+ if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_TEMPORARY)) {
+ kfree(map);
return sprintf(buf, "Out of memory\n");
-
+ }
/* Push back cpu slabs */
flush_all(s);
@@ -3838,9 +3622,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, map);
list_for_each_entry(page, &n->full, lru)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -3852,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf + len, "%7ld ", l->count);
if (l->addr)
- len += sprint_symbol(buf + len, (unsigned long)l->addr);
+ len += sprintf(buf + len, "%pS", (void *)l->addr);
else
len += sprintf(buf + len, "<not-available>");
@@ -3891,11 +3675,76 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
+ kfree(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
}
+#endif
+
+#ifdef SLUB_RESILIENCY_TEST
+static void resiliency_test(void)
+{
+ u8 *p;
+
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
+
+ printk(KERN_ERR "SLUB resiliency testing\n");
+ printk(KERN_ERR "-----------------------\n");
+ printk(KERN_ERR "A. Corruption after allocation\n");
+
+ p = kzalloc(16, GFP_KERNEL);
+ p[16] = 0x12;
+ printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
+ " 0x12->0x%p\n\n", p + 16);
+
+ validate_slab_cache(kmalloc_caches[4]);
+
+ /* Hmmm... The next two are dangerous */
+ p = kzalloc(32, GFP_KERNEL);
+ p[32 + sizeof(void *)] = 0x34;
+ printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
+ " 0x34 -> -0x%p\n", p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+
+ validate_slab_cache(kmalloc_caches[5]);
+ p = kzalloc(64, GFP_KERNEL);
+ p += 64 + (get_cycles() & 0xff) * sizeof(void *);
+ *p = 0x56;
+ printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+ validate_slab_cache(kmalloc_caches[6]);
+
+ printk(KERN_ERR "\nB. Corruption after free\n");
+ p = kzalloc(128, GFP_KERNEL);
+ kfree(p);
+ *p = 0x78;
+ printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[7]);
+
+ p = kzalloc(256, GFP_KERNEL);
+ kfree(p);
+ p[50] = 0x9a;
+ printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
+ p);
+ validate_slab_cache(kmalloc_caches[8]);
+
+ p = kzalloc(512, GFP_KERNEL);
+ kfree(p);
+ p[512] = 0xab;
+ printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[9]);
+}
+#else
+#ifdef CONFIG_SYSFS
+static void resiliency_test(void) {};
+#endif
+#endif
+#ifdef CONFIG_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
SL_PARTIAL, /* Only partially allocated slabs */
@@ -3928,7 +3777,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
int cpu;
for_each_possible_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
if (!c || c->node < 0)
continue;
@@ -3948,6 +3797,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
+ lock_memory_hotplug();
+#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3964,7 +3815,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
nodes[node] += x;
}
- } else if (flags & SO_PARTIAL) {
+ } else
+#endif
+ if (flags & SO_PARTIAL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3985,10 +3838,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
+ unlock_memory_hotplug();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
+#ifdef CONFIG_SLUB_DEBUG
static int any_slab_objects(struct kmem_cache *s)
{
int node;
@@ -4004,6 +3859,7 @@ static int any_slab_objects(struct kmem_cache *s)
}
return 0;
}
+#endif
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -4090,12 +3946,9 @@ SLAB_ATTR(min_partial);
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
- if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
-
- return n + sprintf(buf + n, "\n");
- }
- return 0;
+ if (!s->ctor)
+ return 0;
+ return sprintf(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);
@@ -4105,12 +3958,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(aliases);
-static ssize_t slabs_show(struct kmem_cache *s, char *buf)
-{
- return show_slab_objects(s, buf, SO_ALL);
-}
-SLAB_ATTR_RO(slabs);
-
static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
return show_slab_objects(s, buf, SO_PARTIAL);
@@ -4135,6 +3982,48 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objects_partial);
+static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+}
+
+static ssize_t reclaim_account_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ s->flags &= ~SLAB_RECLAIM_ACCOUNT;
+ if (buf[0] == '1')
+ s->flags |= SLAB_RECLAIM_ACCOUNT;
+ return length;
+}
+SLAB_ATTR(reclaim_account);
+
+static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+}
+SLAB_ATTR_RO(hwcache_align);
+
+#ifdef CONFIG_ZONE_DMA
+static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+}
+SLAB_ATTR_RO(cache_dma);
+#endif
+
+static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
+}
+SLAB_ATTR_RO(destroy_by_rcu);
+
+#ifdef CONFIG_SLUB_DEBUG
+static ssize_t slabs_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL);
+}
+SLAB_ATTR_RO(slabs);
+
static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
@@ -4171,41 +4060,6 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
}
SLAB_ATTR(trace);
-static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
-{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
-}
-
-static ssize_t reclaim_account_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
- if (buf[0] == '1')
- s->flags |= SLAB_RECLAIM_ACCOUNT;
- return length;
-}
-SLAB_ATTR(reclaim_account);
-
-static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
-{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
-}
-SLAB_ATTR_RO(hwcache_align);
-
-#ifdef CONFIG_ZONE_DMA
-static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
-{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
-}
-SLAB_ATTR_RO(cache_dma);
-#endif
-
-static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
-{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
-}
-SLAB_ATTR_RO(destroy_by_rcu);
-
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
@@ -4282,6 +4136,40 @@ static ssize_t validate_store(struct kmem_cache *s,
}
SLAB_ATTR(validate);
+static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_ALLOC);
+}
+SLAB_ATTR_RO(alloc_calls);
+
+static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_FREE);
+}
+SLAB_ATTR_RO(free_calls);
+#endif /* CONFIG_SLUB_DEBUG */
+
+#ifdef CONFIG_FAILSLAB
+static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+}
+
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ s->flags &= ~SLAB_FAILSLAB;
+ if (buf[0] == '1')
+ s->flags |= SLAB_FAILSLAB;
+ return length;
+}
+SLAB_ATTR(failslab);
+#endif
+
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
return 0;
@@ -4301,22 +4189,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
}
SLAB_ATTR(shrink);
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_ALLOC);
-}
-SLAB_ATTR_RO(alloc_calls);
-
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_FREE);
-}
-SLAB_ATTR_RO(free_calls);
-
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
@@ -4353,7 +4225,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
return -ENOMEM;
for_each_online_cpu(cpu) {
- unsigned x = get_cpu_slab(s, cpu)->stat[si];
+ unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
data[cpu] = x;
sum += x;
@@ -4376,7 +4248,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
int cpu;
for_each_online_cpu(cpu)
- get_cpu_slab(s, cpu)->stat[si] = 0;
+ per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
}
#define STAT_ATTR(si, text) \
@@ -4422,25 +4294,27 @@ static struct attribute *slab_attrs[] = {
&min_partial_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
- &total_objects_attr.attr,
- &slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
&ctor_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
- &sanity_checks_attr.attr,
- &trace_attr.attr,
&hwcache_align_attr.attr,
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
+ &shrink_attr.attr,
+#ifdef CONFIG_SLUB_DEBUG
+ &total_objects_attr.attr,
+ &slabs_attr.attr,
+ &sanity_checks_attr.attr,
+ &trace_attr.attr,
&red_zone_attr.attr,
&poison_attr.attr,
&store_user_attr.attr,
&validate_attr.attr,
- &shrink_attr.attr,
&alloc_calls_attr.attr,
&free_calls_attr.attr,
+#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
#endif
@@ -4467,6 +4341,10 @@ static struct attribute *slab_attrs[] = {
&deactivate_remote_frees_attr.attr,
&order_fallback_attr.attr,
#endif
+#ifdef CONFIG_FAILSLAB
+ &failslab_attr.attr,
+#endif
+
NULL
};
@@ -4516,10 +4394,11 @@ static void kmem_cache_release(struct kobject *kobj)
{
struct kmem_cache *s = to_slab(kobj);
+ kfree(s->name);
kfree(s);
}
-static struct sysfs_ops slab_sysfs_ops = {
+static const struct sysfs_ops slab_sysfs_ops = {
.show = slab_attr_show,
.store = slab_attr_store,
};
@@ -4538,7 +4417,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
return 0;
}
-static struct kset_uevent_ops slab_uevent_ops = {
+static const struct kset_uevent_ops slab_uevent_ops = {
.filter = uevent_filter,
};
@@ -4631,6 +4510,13 @@ static int sysfs_slab_add(struct kmem_cache *s)
static void sysfs_slab_remove(struct kmem_cache *s)
{
+ if (slab_state < SYSFS)
+ /*
+ * Sysfs has not been setup yet so no need to remove the
+ * cache from sysfs.
+ */
+ return;
+
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
kobject_put(&s->kobj);
@@ -4676,8 +4562,11 @@ static int __init slab_sysfs_init(void)
struct kmem_cache *s;
int err;
+ down_write(&slub_lock);
+
slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
if (!slab_kset) {
+ up_write(&slub_lock);
printk(KERN_ERR "Cannot register slab subsystem.\n");
return -ENOSYS;
}
@@ -4702,12 +4591,13 @@ static int __init slab_sysfs_init(void)
kfree(al);
}
+ up_write(&slub_lock);
resiliency_test();
return 0;
}
__initcall(slab_sysfs_init);
-#endif
+#endif /* CONFIG_SYSFS */
/*
* The /proc/slabinfo ABI
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
*
* However, virtual mappings need a page table and TLBs. Many Linux
* architectures already map their physical space using 1-1 mappings
- * via TLBs. For those arches the virtual memmory map is essentially
+ * via TLBs. For those arches the virtual memory map is essentially
* for free if we use the same page size as the 1-1 mappings. In that
* case the overhead consists of a few additional pages that are
* allocated to create a view of memory for vmemmap.
@@ -22,6 +22,7 @@
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
@@ -40,9 +41,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return __alloc_bootmem_node(NODE_DATA(node), size, align, goal);
+ return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
}
+static void *vmemmap_buf;
+static void *vmemmap_buf_end;
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
@@ -64,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
__pa(MAX_DMA_ADDRESS));
}
+/* need to make sure size is all the same during early stage */
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+{
+ void *ptr;
+
+ if (!vmemmap_buf)
+ return vmemmap_alloc_block(size, node);
+
+ /* take the from buf */
+ ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
+ if (ptr + size > vmemmap_buf_end)
+ return vmemmap_alloc_block(size, node);
+
+ vmemmap_buf = ptr + size;
+
+ return ptr;
+}
+
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
@@ -80,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +184,44 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
return map;
}
+
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+ void *vmemmap_buf_start;
+
+ size = ALIGN(size, PMD_SIZE);
+ vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
+ PMD_SIZE, __pa(MAX_DMA_ADDRESS));
+
+ if (vmemmap_buf_start) {
+ vmemmap_buf = vmemmap_buf_start;
+ vmemmap_buf_end = vmemmap_buf_start + size * map_count;
+ }
+
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+
+ if (vmemmap_buf_start) {
+ /* need to free left buf */
+ free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
+ vmemmap_buf = NULL;
+ vmemmap_buf_end = NULL;
+ }
+}
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -2,6 +2,7 @@
* sparse memory mappings.
*/
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
@@ -271,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
static unsigned long * __init
-sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long count)
{
unsigned long section_nr;
@@ -286,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
* this problem.
*/
section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
- return alloc_bootmem_section(usemap_size(), section_nr);
+ return alloc_bootmem_section(usemap_size() * count, section_nr);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#else
static unsigned long * __init
-sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long count)
{
return NULL;
}
@@ -339,44 +342,117 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
+static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long usemap_count, int nodeid)
{
- unsigned long *usemap;
- struct mem_section *ms = __nr_to_section(pnum);
- int nid = sparse_early_nid(ms);
-
- usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
- if (usemap)
- return usemap;
+ void *usemap;
+ unsigned long pnum;
+ int size = usemap_size();
- usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
+ usemap_count);
if (usemap) {
- check_usemap_section_nr(nid, usemap);
- return usemap;
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ usemap += size;
+ }
+ return;
}
- /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
- nid = 0;
+ usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
+ if (usemap) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ usemap += size;
+ check_usemap_section_nr(nodeid, usemap_map[pnum]);
+ }
+ return;
+ }
printk(KERN_WARNING "%s: allocation failed\n", __func__);
- return NULL;
}
#ifndef CONFIG_SPARSEMEM_VMEMMAP
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
{
struct page *map;
+ unsigned long size;
map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;
- map = alloc_bootmem_pages_node(NODE_DATA(nid),
- PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
+ size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
+ map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
return map;
}
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ void *map;
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+
+ map = alloc_remap(nodeid, size * map_count);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ size = PAGE_ALIGN(size);
+ map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ /* fallback */
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
+ map_count, nodeid);
+}
+#else
static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -392,10 +468,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
ms->section_mem_map = 0;
return NULL;
}
+#endif
void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
{
}
+
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -407,6 +485,14 @@ void __init sparse_init(void)
unsigned long *usemap;
unsigned long **usemap_map;
int size;
+ int nodeid_begin = 0;
+ unsigned long pnum_begin = 0;
+ unsigned long usemap_count;
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ unsigned long map_count;
+ int size2;
+ struct page **map_map;
+#endif
/*
* map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +511,81 @@ void __init sparse_init(void)
panic("can not allocate usemap_map\n");
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
if (!present_section_nr(pnum))
continue;
- usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
}
+ usemap_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ usemap_count++;
+ continue;
+ }
+ /* ok, we need to take care of pnum_begin to pnum - 1 */
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
+ usemap_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ usemap_count = 1;
+ }
+ /* ok, last chunk */
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
+ usemap_count, nodeid_begin);
+
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
+ map_map = alloc_bootmem(size2);
+ if (!map_map)
+ panic("can not allocate map_map\n");
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ map_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ map_count++;
+ continue;
+ }
+ /* ok, we need to take care of pnum_begin to pnum - 1 */
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
+ map_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ map_count = 1;
+ }
+ /* ok, last chunk */
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
+ map_count, nodeid_begin);
+#endif
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!present_section_nr(pnum))
@@ -438,7 +595,11 @@ void __init sparse_init(void)
if (!usemap)
continue;
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ map = map_map[pnum];
+#else
map = sparse_early_mem_map_alloc(pnum);
+#endif
if (!map)
continue;
@@ -448,6 +609,9 @@ void __init sparse_init(void)
vmemmap_populate_print_last();
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ free_bootmem(__pa(map_map), size2);
+#endif
free_bootmem(__pa(usemap_map), size);
}
@@ -507,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
- int magic;
+ unsigned long magic;
for (i = 0; i < nr_pages; i++, page++) {
- magic = atomic_read(&page->_mapcount);
+ magic = (unsigned long) page->lru.next;
BUG_ON(magic == NODE_INFO);
diff --git a/mm/swap.c b/mm/swap.c
index 308e57d8d7ed..c02f93611a84 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
+#include <linux/gfp.h>
#include "internal.h"
@@ -55,17 +56,97 @@ static void __page_cache_release(struct page *page)
del_page_from_lru(zone, page);
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
- free_hot_page(page);
}
-static void put_compound_page(struct page *page)
+static void __put_single_page(struct page *page)
{
- page = compound_head(page);
- if (put_page_testzero(page)) {
- compound_page_dtor *dtor;
+ __page_cache_release(page);
+ free_hot_cold_page(page, 0);
+}
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+static void __put_compound_page(struct page *page)
+{
+ compound_page_dtor *dtor;
+
+ __page_cache_release(page);
+ dtor = get_compound_page_dtor(page);
+ (*dtor)(page);
+}
+
+static void put_compound_page(struct page *page)
+{
+ if (unlikely(PageTail(page))) {
+ /* __split_huge_page_refcount can run under us */
+ struct page *page_head = page->first_page;
+ smp_rmb();
+ /*
+ * If PageTail is still set after smp_rmb() we can be sure
+ * that the page->first_page we read wasn't a dangling pointer.
+ * See __split_huge_page_refcount() smp_wmb().
+ */
+ if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+ /*
+ * Verify that our page_head wasn't converted
+ * to a regular page before we got a
+ * reference on it.
+ */
+ if (unlikely(!PageHead(page_head))) {
+ /* PageHead is cleared after PageTail */
+ smp_rmb();
+ VM_BUG_ON(PageTail(page));
+ goto out_put_head;
+ }
+ /*
+ * Only run compound_lock on a valid PageHead,
+ * after having it pinned with
+ * get_page_unless_zero() above.
+ */
+ smp_mb();
+ /* page_head wasn't a dangling pointer */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
+ VM_BUG_ON(PageHead(page_head));
+ out_put_head:
+ if (put_page_testzero(page_head))
+ __put_single_page(page_head);
+ out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
+ }
+ VM_BUG_ON(page_head != page->first_page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero now that
+ * split_huge_page_refcount is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_dec(&page->_count);
+ VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ compound_unlock_irqrestore(page_head, flags);
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* page_head is a dangling pointer */
+ VM_BUG_ON(PageTail(page));
+ goto out_put_single;
+ }
+ } else if (put_page_testzero(page)) {
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
}
}
@@ -74,7 +155,7 @@ void put_page(struct page *page)
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
- __page_cache_release(page);
+ __put_single_page(page);
}
EXPORT_SYMBOL(put_page);
@@ -223,6 +304,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
____pagevec_lru_add(pvec, lru);
put_cpu_var(lru_add_pvecs);
}
+EXPORT_SYMBOL(__lru_cache_add);
/**
* lru_cache_add_lru - add a page to a page list
@@ -376,6 +458,7 @@ void release_pages(struct page **pages, int nr, int cold)
pagevec_free(&pages_to_free);
}
+EXPORT_SYMBOL(release_pages);
/*
* The pages which we're about to release may be in the deferred lru-addition
@@ -396,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_release);
+/* used by __split_huge_page_refcount() */
+void lru_add_page_tail(struct zone* zone,
+ struct page *page, struct page *page_tail)
+{
+ int active;
+ enum lru_list lru;
+ const int file = 0;
+ struct list_head *head;
+
+ VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON(PageCompound(page_tail));
+ VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
+
+ SetPageLRU(page_tail);
+
+ if (page_evictable(page_tail, NULL)) {
+ if (PageActive(page)) {
+ SetPageActive(page_tail);
+ active = 1;
+ lru = LRU_ACTIVE_ANON;
+ } else {
+ active = 0;
+ lru = LRU_INACTIVE_ANON;
+ }
+ update_page_reclaim_stat(zone, page_tail, file, active);
+ if (likely(PageLRU(page)))
+ head = page->lru.prev;
+ else
+ head = &zone->lru[lru].list;
+ __add_page_to_lru_list(zone, page_tail, lru, head);
+ } else {
+ SetPageUnevictable(page_tail);
+ add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
+ }
+}
+
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d1daeb1cb4a..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -8,6 +8,7 @@
*/
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -156,6 +157,12 @@ int add_to_swap(struct page *page)
if (!entry.val)
return 0;
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page))) {
+ swapcache_free(entry, NULL);
+ return 0;
+ }
+
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c0585b16418..07a458d72fa8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -30,6 +30,7 @@
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
+#include <linux/poll.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
+/* Activity counter to indicate that a swapon or swapoff has occurred */
+static atomic_t proc_poll_event = ATOMIC_INIT(0);
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -139,7 +144,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL, 0);
if (err)
return err;
cond_resched();
@@ -150,7 +155,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL, 0);
if (err)
break;
@@ -189,7 +194,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+ nr_blocks, GFP_NOIO, 0))
break;
}
@@ -574,6 +579,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
/* free if no reference */
if (!usage) {
+ struct gendisk *disk = p->bdev->bd_disk;
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -583,6 +589,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
+ if ((p->flags & SWP_BLKDEV) &&
+ disk->fops->swap_slot_free_notify)
+ disk->fops->swap_slot_free_notify(p->bdev, offset);
}
return usage;
@@ -679,6 +688,24 @@ int try_to_free_swap(struct page *page)
if (page_swapcount(page))
return 0;
+ /*
+ * Once hibernation has begun to create its image of memory,
+ * there's a danger that one of the calls to try_to_free_swap()
+ * - most probably a call from __try_to_reclaim_swap() while
+ * hibernation is allocating its own swap pages for the image,
+ * but conceivably even a call from memory reclaim - will free
+ * the swap from a page which has already been recorded in the
+ * image as a clean swapcache page, and then reuse its swap for
+ * another page of the image. On waking from hibernation, the
+ * original page might be freed under memory pressure, then
+ * later read back in from swap, now with the wrong data.
+ *
+ * Hibernation clears bits from gfp_allowed_mask to prevent
+ * memory reclaim from writing to disk, so check that here.
+ */
+ if (!(gfp_allowed_mask & __GFP_IO))
+ return 0;
+
delete_from_swap_cache(page);
SetPageDirty(page);
return 1;
@@ -723,6 +750,37 @@ int free_swap_and_cache(swp_entry_t entry)
return p != NULL;
}
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_count_swap_user - count the user of a swap entry
+ * @ent: the swap entry to be checked
+ * @pagep: the pointer for the swap cache page of the entry to be stored
+ *
+ * Returns the number of the user of the swap entry. The number is valid only
+ * for swaps of anonymous pages.
+ * If the entry is found on swap cache, the page is stored to pagep with
+ * refcount of it being incremented.
+ */
+int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
+{
+ struct page *page;
+ struct swap_info_struct *p;
+ int count = 0;
+
+ page = find_get_page(&swapper_space, ent.val);
+ if (page)
+ count += page_mapcount(page);
+ p = swap_info_get(ent);
+ if (p) {
+ count += swap_count(p->swap_map[swp_offset(ent)]);
+ spin_unlock(&swap_lock);
+ }
+
+ *pagep = page;
+ return count;
+}
+#endif
+
#ifdef CONFIG_HIBERNATION
/*
* Find the swap type that corresponds to given device (if any).
@@ -840,7 +898,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
- inc_mm_counter(vma->vm_mm, anon_rss);
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -905,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (unlikely(pmd_trans_huge(*pmd)))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1618,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
mutex_lock(&inode->i_mutex);
inode->i_flags &= ~S_SWAPFILE;
@@ -1626,6 +1687,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
}
filp_close(swap_file, NULL);
err = 0;
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
out_dput:
filp_close(victim, NULL);
@@ -1634,6 +1697,25 @@ out:
}
#ifdef CONFIG_PROC_FS
+struct proc_swaps {
+ struct seq_file seq;
+ int event;
+};
+
+static unsigned swaps_poll(struct file *file, poll_table *wait)
+{
+ struct proc_swaps *s = file->private_data;
+
+ poll_wait(file, &proc_poll_wait, wait);
+
+ if (s->event != atomic_read(&proc_poll_event)) {
+ s->event = atomic_read(&proc_poll_event);
+ return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ }
+
+ return POLLIN | POLLRDNORM;
+}
+
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
@@ -1717,7 +1799,24 @@ static const struct seq_operations swaps_op = {
static int swaps_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &swaps_op);
+ struct proc_swaps *s;
+ int ret;
+
+ s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ file->private_data = s;
+
+ ret = seq_open(file, &swaps_op);
+ if (ret) {
+ kfree(s);
+ return ret;
+ }
+
+ s->seq.private = s;
+ s->event = atomic_read(&proc_poll_event);
+ return ret;
}
static const struct file_operations proc_swaps_operations = {
@@ -1725,6 +1824,7 @@ static const struct file_operations proc_swaps_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
+ .poll = swaps_poll,
};
static int __init procswaps_init(void)
@@ -1759,11 +1859,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned int type;
int i, prev;
int error;
- union swap_header *swap_header = NULL;
- unsigned int nr_good_pages = 0;
+ union swap_header *swap_header;
+ unsigned int nr_good_pages;
int nr_extents = 0;
sector_t span;
- unsigned long maxpages = 1;
+ unsigned long maxpages;
unsigned long swapfilepages;
unsigned char *swap_map = NULL;
struct page *page = NULL;
@@ -1841,7 +1941,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -EINVAL;
if (S_ISBLK(inode->i_mode)) {
bdev = I_BDEV(inode);
- error = bd_claim(bdev, sys_swapon);
+ error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ sys_swapon);
if (error < 0) {
bdev = NULL;
error = -EINVAL;
@@ -1852,6 +1953,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error < 0)
goto bad_swap;
p->bdev = bdev;
+ p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
mutex_lock(&inode->i_mutex);
@@ -1922,9 +2024,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
* swap pte.
*/
maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
- if (maxpages > swap_header->info.last_page)
- maxpages = swap_header->info.last_page;
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ if (maxpages > swap_header->info.last_page) {
+ maxpages = swap_header->info.last_page + 1;
+ /* p->max is an unsigned int: don't overflow it */
+ if ((unsigned int)maxpages == 0)
+ maxpages = UINT_MAX;
+ }
p->highest_bit = maxpages - 1;
error = -EINVAL;
@@ -1948,23 +2054,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
memset(swap_map, 0, maxpages);
+ nr_good_pages = maxpages - 1; /* omit header page */
+
for (i = 0; i < swap_header->info.nr_badpages; i++) {
- int page_nr = swap_header->info.badpages[i];
- if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+ if (page_nr == 0 || page_nr > swap_header->info.last_page) {
error = -EINVAL;
goto bad_swap;
}
- swap_map[page_nr] = SWAP_MAP_BAD;
+ if (page_nr < maxpages) {
+ swap_map[page_nr] = SWAP_MAP_BAD;
+ nr_good_pages--;
+ }
}
error = swap_cgroup_swapon(type, maxpages);
if (error)
goto bad_swap;
- nr_good_pages = swap_header->info.last_page -
- swap_header->info.nr_badpages -
- 1 /* header page */;
-
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
@@ -1987,7 +2094,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->flags |= SWP_SOLIDSTATE;
p->cluster_next = 1 + (random32() % p->highest_bit);
}
- if (discard_swap(p) == 0)
+ if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
p->flags |= SWP_DISCARDABLE;
}
@@ -2024,12 +2131,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
swap_info[prev]->next = type;
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
+
error = 0;
goto out;
bad_swap:
if (bdev) {
set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
destroy_swap_extents(p);
swap_cgroup_swapoff(type);
@@ -2155,7 +2265,11 @@ void swap_shmem_alloc(swp_entry_t entry)
}
/*
- * increase reference count of swap entry by 1.
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated. Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
*/
int swap_duplicate(swp_entry_t entry)
{
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e37244829..3c2d5ddfa0d4 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/backing-dev.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
@@ -389,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(page);
+
page_cache_release(page); /* pagecache ref */
return 1;
failed:
@@ -540,28 +545,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
EXPORT_SYMBOL(truncate_pagecache);
/**
+ * truncate_setsize - update inode and pagecache for a new file size
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * truncate_setsize updates i_size and performs pagecache
+ * truncation (if necessary) for a file size update. It will
+ * typically be called from the filesystem's setattr function when
+ * ATTR_SIZE is passed in.
+ *
+ * Must be called with inode_mutex held and after all filesystem
+ * specific block truncation has been performed.
+ */
+void truncate_setsize(struct inode *inode, loff_t newsize)
+{
+ loff_t oldsize;
+
+ oldsize = inode->i_size;
+ i_size_write(inode, newsize);
+
+ truncate_pagecache(inode, oldsize, newsize);
+}
+EXPORT_SYMBOL(truncate_setsize);
+
+/**
* vmtruncate - unmap mappings "freed" by truncate() syscall
* @inode: inode of the file used
* @offset: file offset to start truncating
*
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
+ * This function is deprecated and truncate_setsize or truncate_pagecache
+ * should be used instead, together with filesystem specific block truncation.
*/
int vmtruncate(struct inode *inode, loff_t offset)
{
- loff_t oldsize;
int error;
error = inode_newsize_ok(inode, offset);
if (error)
return error;
- oldsize = inode->i_size;
- i_size_write(inode, offset);
- truncate_pagecache(inode, oldsize, offset);
+
+ truncate_setsize(inode, offset);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
-
- return error;
+ return 0;
}
EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/util.c b/mm/util.c
index 834db7be240f..f126975ef23e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -204,15 +204,10 @@ char *strndup_user(const char __user *s, long n)
if (length > n)
return ERR_PTR(-EINVAL);
- p = kmalloc(length, GFP_KERNEL);
+ p = memdup_user(s, length);
- if (!p)
- return ERR_PTR(-ENOMEM);
-
- if (copy_from_user(p, s, length)) {
- kfree(p);
- return ERR_PTR(-EFAULT);
- }
+ if (IS_ERR(p))
+ return p;
p[length - 1] = '\0';
@@ -229,6 +224,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
}
#endif
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ */
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d55d905463eb..f9b166732e70 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,7 +31,6 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
-
/*** Page table manipulation functions ***/
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -292,13 +291,13 @@ static void __insert_vmap_area(struct vmap_area *va)
struct rb_node *tmp;
while (*p) {
- struct vmap_area *tmp;
+ struct vmap_area *tmp_va;
parent = *p;
- tmp = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp->va_end)
+ tmp_va = rb_entry(parent, struct vmap_area, rb_node);
+ if (va->va_start < tmp_va->va_end)
p = &(*p)->rb_left;
- else if (va->va_end > tmp->va_start)
+ else if (va->va_end > tmp_va->va_start)
p = &(*p)->rb_right;
else
BUG();
@@ -509,6 +508,18 @@ static unsigned long lazy_max_pages(void)
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/* for per-CPU blocks */
+static void purge_fragmented_blocks_allcpus(void);
+
+/*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+ atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
/*
* Purges all lazily-freed vmap areas.
*
@@ -539,6 +550,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
} else
spin_lock(&purge_lock);
+ if (sync)
+ purge_fragmented_blocks_allcpus();
+
rcu_read_lock();
list_for_each_entry_rcu(va, &vmap_area_list, list) {
if (va->flags & VM_LAZY_FREE) {
@@ -547,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
if (va->va_end > *end)
*end = va->va_end;
nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- unmap_vmap_area(va);
list_add_tail(&va->purge_list, &valist);
va->flags |= VM_LAZY_FREEING;
va->flags &= ~VM_LAZY_FREE;
@@ -592,10 +605,11 @@ static void purge_vmap_area_lazy(void)
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped
+ * and flush_cache_vunmap had been called for the correct range
+ * previously.
*/
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+static void free_vmap_area_noflush(struct vmap_area *va)
{
va->flags |= VM_LAZY_FREE;
atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -604,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
}
/*
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
+ */
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+{
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
+}
+
+/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
@@ -667,8 +691,6 @@ static bool vmap_initialized __read_mostly = false;
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
- struct list_head dirty;
- unsigned int nr_dirty;
};
struct vmap_block {
@@ -678,10 +700,9 @@ struct vmap_block {
unsigned long free, dirty;
DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
- union {
- struct list_head free_list;
- struct rcu_head rcu_head;
- };
+ struct list_head free_list;
+ struct rcu_head rcu_head;
+ struct list_head purge;
};
/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -727,9 +748,9 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask);
- if (unlikely(IS_ERR(va))) {
+ if (IS_ERR(va)) {
kfree(vb);
- return ERR_PTR(PTR_ERR(va));
+ return ERR_CAST(va);
}
err = radix_tree_preload(gfp_mask);
@@ -757,7 +778,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vbq = &get_cpu_var(vmap_block_queue);
vb->vbq = vbq;
spin_lock(&vbq->lock);
- list_add(&vb->free_list, &vbq->free);
+ list_add_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
put_cpu_var(vmap_block_queue);
@@ -776,24 +797,71 @@ static void free_vmap_block(struct vmap_block *vb)
struct vmap_block *tmp;
unsigned long vb_idx;
- BUG_ON(!list_empty(&vb->free_list));
-
vb_idx = addr_to_vb_idx(vb->va->va_start);
spin_lock(&vmap_block_tree_lock);
tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
spin_unlock(&vmap_block_tree_lock);
BUG_ON(tmp != vb);
- free_unmap_vmap_area_noflush(vb->va);
+ free_vmap_area_noflush(vb->va);
call_rcu(&vb->rcu_head, rcu_free_vb);
}
+static void purge_fragmented_blocks(int cpu)
+{
+ LIST_HEAD(purge);
+ struct vmap_block *vb;
+ struct vmap_block *n_vb;
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+
+ if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ continue;
+
+ spin_lock(&vb->lock);
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
+ vb->free = 0; /* prevent further allocs after releasing lock */
+ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+ bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
+ bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ spin_unlock(&vb->lock);
+ list_add_tail(&vb->purge, &purge);
+ } else
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(vb, n_vb, &purge, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
+static void purge_fragmented_blocks_thiscpu(void)
+{
+ purge_fragmented_blocks(smp_processor_id());
+}
+
+static void purge_fragmented_blocks_allcpus(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ purge_fragmented_blocks(cpu);
+}
+
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
unsigned long addr = 0;
unsigned int order;
+ int purge = 0;
BUG_ON(size & ~PAGE_MASK);
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -806,24 +874,38 @@ again:
int i;
spin_lock(&vb->lock);
+ if (vb->free < 1UL << order)
+ goto next;
+
i = bitmap_find_free_region(vb->alloc_map,
VMAP_BBMAP_BITS, order);
- if (i >= 0) {
- addr = vb->va->va_start + (i << PAGE_SHIFT);
- BUG_ON(addr_to_vb_idx(addr) !=
- addr_to_vb_idx(vb->va->va_start));
- vb->free -= 1UL << order;
- if (vb->free == 0) {
- spin_lock(&vbq->lock);
- list_del_init(&vb->free_list);
- spin_unlock(&vbq->lock);
+ if (i < 0) {
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
+ /* fragmented and no outstanding allocations */
+ BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
+ purge = 1;
}
- spin_unlock(&vb->lock);
- break;
+ goto next;
+ }
+ addr = vb->va->va_start + (i << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) !=
+ addr_to_vb_idx(vb->va->va_start));
+ vb->free -= 1UL << order;
+ if (vb->free == 0) {
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
}
spin_unlock(&vb->lock);
+ break;
+next:
+ spin_unlock(&vb->lock);
}
+
+ if (purge)
+ purge_fragmented_blocks_thiscpu();
+
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
@@ -859,12 +941,14 @@ static void vb_free(const void *addr, unsigned long size)
rcu_read_unlock();
BUG_ON(!vb);
+ vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+
spin_lock(&vb->lock);
- bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
+ BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
vb->dirty += 1UL << order;
if (vb->dirty == VMAP_BBMAP_BITS) {
- BUG_ON(vb->free || !list_empty(&vb->free_list));
+ BUG_ON(vb->free);
spin_unlock(&vb->lock);
free_vmap_block(vb);
} else
@@ -911,7 +995,6 @@ void vm_unmap_aliases(void)
s = vb->va->va_start + (i << PAGE_SHIFT);
e = vb->va->va_start + (j << PAGE_SHIFT);
- vunmap_page_range(s, e);
flush = 1;
if (s < start)
@@ -1033,8 +1116,6 @@ void __init vmalloc_init(void)
vbq = &per_cpu(vmap_block_queue, i);
spin_lock_init(&vbq->lock);
INIT_LIST_HEAD(&vbq->free);
- INIT_LIST_HEAD(&vbq->dirty);
- vbq->nr_dirty = 0;
}
/* Import existing vmlist entries. */
@@ -1094,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
vunmap_page_range(addr, addr + size);
}
+EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
@@ -1234,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
-1, GFP_KERNEL, caller);
}
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
- int node, gfp_t gfp_mask)
-{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
- node, gfp_mask, __builtin_return_address(0));
-}
-
static struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
@@ -1456,25 +1531,12 @@ fail:
return NULL;
}
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
-{
- void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
- __builtin_return_address(0));
-
- /*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
- */
- kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
-
- return addr;
-}
-
/**
- * __vmalloc_node - allocate virtually contiguous memory
+ * __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
@@ -1484,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, int node, void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1496,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;
- area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
- VMALLOC_END, node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
+ gfp_mask, caller);
if (!area)
return NULL;
@@ -1514,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
return addr;
}
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or -1
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ *
+ * This is __vmalloc_node_range() with the address range fixed to
+ * VMALLOC_START..VMALLOC_END; every other argument is passed through
+ * unchanged and its return value is returned as-is.
+ */
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller)
+{
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp_mask, prot, node, caller);
+}
+
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -1521,6 +1604,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
+static inline void *__vmalloc_node_flags(unsigned long size,
+ int node, gfp_t flags)
+{ /* common helper for vmalloc(), vzalloc() and vzalloc_node(): alignment 1, PAGE_KERNEL protection */
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
+ node, __builtin_return_address(0)); /* NOTE(review): names the public entry point's caller only if this helper is actually inlined -- confirm */
+}
+
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -1532,12 +1622,28 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
- -1, __builtin_return_address(0));
+ return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
/**
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * Uses the same flags as vmalloc() (GFP_KERNEL | __GFP_HIGHMEM) with
+ * __GFP_ZERO added, and passes -1 as the node.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc_node_flags(size, -1,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
*
@@ -1578,6 +1684,25 @@ void *vmalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vmalloc_node);
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * Uses the same flags as vzalloc() (GFP_KERNEL | __GFP_HIGHMEM |
+ * __GFP_ZERO) but passes @node instead of -1.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc_node() instead.
+ * NOTE(review): __vmalloc_node() is declared static in this file, so the
+ * advice above only applies to callers inside it -- confirm the intended
+ * alternative for external users.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node_flags(size, node,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -1990,6 +2115,7 @@ void free_vm_area(struct vm_struct *area)
}
EXPORT_SYMBOL_GPL(free_vm_area);
+#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2079,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* @sizes: array containing size of each area
* @nr_vms: the number of areas to allocate
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
- * @gfp_mask: allocation mask
*
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
* vm_structs on success, %NULL on failure
*
* Percpu allocator wants to use congruent vm areas so that it can
* maintain the offsets among percpu areas. This function allocates
- * congruent vmalloc areas for it. These areas tend to be scattered
- * pretty far, distance between two areas easily going up to
- * gigabytes. To avoid interacting with regular vmallocs, these areas
- * are allocated from top.
+ * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes. To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
*
* Despite its complicated look, this allocator is rather simple. It
* does everything top-down and scans areas from the end looking for
@@ -2100,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align, gfp_t gfp_mask)
+ size_t align)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2110,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
unsigned long base, start, end, last_end;
bool purged = false;
- gfp_mask &= GFP_RECLAIM_MASK;
-
/* verify parameters and allocate data structures */
BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2144,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
- vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
- vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
+ vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
+ vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
if (!vas || !vms)
goto err_free;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
- vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
+ vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
}
@@ -2270,9 +2393,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
free_vm_area(vms[i]);
kfree(vms);
}
+#endif /* CONFIG_SMP */
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmlist_lock)
{
loff_t n = *pos;
struct vm_struct *v;
@@ -2299,6 +2424,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmlist_lock)
{
read_unlock(&vmlist_lock); /* drops the read side of vmlist_lock, which s_start() is annotated as acquiring */
}
@@ -2329,19 +2455,14 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, "0x%p-0x%p %7ld",
v->addr, v->addr + v->size, v->size);
- if (v->caller) {
- char buff[KSYM_SYMBOL_LEN];
-
- seq_putc(m, ' ');
- sprint_symbol(buff, (unsigned long)v->caller);
- seq_puts(m, buff);
- }
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
if (v->nr_pages)
seq_printf(m, " pages=%d", v->nr_pages);
if (v->phys_addr)
- seq_printf(m, " phys=%lx", v->phys_addr);
+ seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
if (v->flags & VM_IOREMAP)
seq_printf(m, " ioremap");
@@ -2375,8 +2496,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
unsigned int *ptr = NULL;
int ret;
- if (NUMA_BUILD)
+ if (NUMA_BUILD) {
ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ if (ptr == NULL)
+ return -ENOMEM;
+ }
ret = seq_open(file, &vmalloc_op);
if (!ret) {
struct seq_file *m = file->private_data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c26986c85ce0..47a50962ce81 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -13,7 +13,7 @@
#include <linux/mm.h>
#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
@@ -32,6 +32,7 @@
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
@@ -40,6 +41,7 @@
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -48,6 +50,27 @@
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
+/*
+ * reclaim_mode determines how the inactive list is shrunk
+ * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
+ * RECLAIM_MODE_ASYNC: Do not block
+ * RECLAIM_MODE_SYNC: Allow blocking, e.g. calling wait_on_page_writeback()
+ * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
+ * page from the LRU and reclaim all pages within a
+ * naturally aligned range
+ * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ * order-0 pages and then compact the zone
+ *
+ * Each value is a distinct bit; they are OR-ed together in
+ * scan_control.reclaim_mode and tested with a bitwise AND.
+ */
+typedef unsigned __bitwise__ reclaim_mode_t;
+#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
+#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
+#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
+#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
+#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -73,10 +96,14 @@ struct scan_control {
int swappiness;
- int all_unreclaimable;
-
int order;
+ /*
+ * Intend to reclaim enough contiguous memory rather than just a
+ * sufficient amount of memory, i.e. the mode for high-order allocation.
+ */
+ reclaim_mode_t reclaim_mode;
+
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -85,12 +112,6 @@ struct scan_control {
* are scanned.
*/
nodemask_t *nodemask;
-
- /* Pluggable isolate pages callback */
- unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
- unsigned long *scanned, int order, int mode,
- struct zone *z, struct mem_cgroup *mem_cont,
- int active, int file);
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -215,8 +236,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
- unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
+ unsigned long max_pass;
+ max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
@@ -244,8 +266,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
int shrink_ret;
int nr_before;
- nr_before = (*shrinker->shrink)(0, gfp_mask);
- shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
+ nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
+ shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
+ gfp_mask);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
@@ -262,25 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
return ret;
}
-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
+static void set_reclaim_mode(int priority, struct scan_control *sc,
+ bool sync)
{
- struct address_space *mapping;
-
- /* Page is in somebody's page tables. */
- if (page_mapped(page))
- return 1;
+ reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
- /* Be more reluctant to reclaim swapcache than pagecache */
- if (PageSwapCache(page))
- return 1;
+ /*
+ * Initially assume we are entering either lumpy reclaim or
+ * reclaim/compaction. Depending on the order, we will either set the
+ * sync mode or just reclaim order-0 pages later.
+ */
+ if (COMPACTION_BUILD)
+ sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
+ else
+ sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
- mapping = page_mapping(page);
- if (!mapping)
- return 0;
+ /*
+ * Avoid using lumpy reclaim or reclaim/compaction if possible by
+ * restricting when it is set to either costly allocations or when
+ * under memory pressure (priority below DEF_PRIORITY - 2).
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ sc->reclaim_mode |= syncmode;
+ else if (sc->order && priority < DEF_PRIORITY - 2)
+ sc->reclaim_mode |= syncmode;
+ else
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
+}
- /* File is mmap'd by somebody? */
- return mapping_mapped(mapping);
+static void reset_reclaim_mode(struct scan_control *sc)
+{
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; /* order-0 pages only, never block; drops any lumpy/compaction and sync bits */
}
static inline int is_page_cache_freeable(struct page *page)
@@ -293,7 +328,8 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi)
+static int may_write_to_queue(struct backing_dev_info *bdi,
+ struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
@@ -301,6 +337,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
return 1;
if (bdi == current->backing_dev_info)
return 1;
+
+ /* lumpy reclaim for hugepage often need a lot of write */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ return 1;
return 0;
}
@@ -319,18 +359,12 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
- lock_page(page);
+ lock_page_nosync(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
unlock_page(page);
}
-/* Request for sync pageout. */
-enum pageout_io {
- PAGEOUT_IO_ASYNC,
- PAGEOUT_IO_SYNC,
-};
-
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@@ -348,7 +382,7 @@ typedef enum {
* Calls ->writepage().
*/
static pageout_t pageout(struct page *page, struct address_space *mapping,
- enum pageout_io sync_writeback)
+ struct scan_control *sc)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -384,7 +418,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info))
+ if (!may_write_to_queue(mapping->backing_dev_info, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -394,7 +428,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1,
};
@@ -412,13 +445,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* direct reclaiming a large contiguous area and the
* first attempt to free a range of pages fails.
*/
- if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+ if (PageWriteback(page) &&
+ (sc->reclaim_mode & RECLAIM_MODE_SYNC))
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
+ trace_mm_vmscan_writepage(page,
+ trace_reclaim_flags(page, sc->reclaim_mode));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -475,9 +511,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap, page);
} else {
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage != NULL)
+ freepage(page);
}
return 1;
@@ -579,27 +622,104 @@ redo:
put_page(page); /* drop ref from isolate */
}
+enum page_references { /* verdicts returned by page_check_references() */
+ PAGEREF_RECLAIM, /* go ahead and try to reclaim the page */
+ PAGEREF_RECLAIM_CLEAN, /* reclaim only if clean; shrink_page_list() keeps the page when it is dirty */
+ PAGEREF_KEEP, /* keep the page on the inactive list for another pass */
+ PAGEREF_ACTIVATE, /* shrink_page_list() jumps to activate_locked for this page */
+};
+/*
+ * Classify @page by how it was recently referenced and tell
+ * shrink_page_list() what to do with it.
+ *
+ * Page-table references are collected with page_referenced() and the
+ * page's referenced flag is test-and-cleared; both happen before any of
+ * the early returns below. The result is:
+ * - PAGEREF_RECLAIM when sc->reclaim_mode has RECLAIM_MODE_LUMPYRECLAIM
+ * set, when the mapping is VM_LOCKED, or when nothing else applies;
+ * - PAGEREF_ACTIVATE for an anonymous page with page-table references,
+ * or any such page whose referenced flag was already set;
+ * - PAGEREF_KEEP for a page with page-table references whose referenced
+ * flag was not yet set (the flag is set here for the next pass);
+ * - PAGEREF_RECLAIM_CLEAN when only the referenced flag was set and the
+ * page is not swap backed.
+ */
+static enum page_references page_check_references(struct page *page,
+ struct scan_control *sc)
+{
+ int referenced_ptes, referenced_page;
+ unsigned long vm_flags;
+
+ referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+ referenced_page = TestClearPageReferenced(page);
+
+ /* Lumpy reclaim - ignore the references gathered above */
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+ return PAGEREF_RECLAIM;
+
+ /*
+ * Mlock lost the isolation race with us. Let try_to_unmap()
+ * move the page to the unevictable list.
+ */
+ if (vm_flags & VM_LOCKED)
+ return PAGEREF_RECLAIM;
+
+ if (referenced_ptes) {
+ if (PageAnon(page))
+ return PAGEREF_ACTIVATE;
+ /*
+ * All mapped pages start out with page table
+ * references from the instantiating fault, so we need
+ * to look twice if a mapped file page is used more
+ * than once.
+ *
+ * Mark it and spare it for another trip around the
+ * inactive list. Another page table reference will
+ * lead to its activation.
+ *
+ * Note: the mark is set for activated pages as well
+ * so that recently deactivated but used pages are
+ * quickly recovered.
+ */
+ SetPageReferenced(page);
+
+ if (referenced_page)
+ return PAGEREF_ACTIVATE;
+
+ return PAGEREF_KEEP;
+ }
+
+ /* Reclaim if clean, defer dirty pages to writeback */
+ if (referenced_page && !PageSwapBacked(page))
+ return PAGEREF_RECLAIM_CLEAN;
+
+ return PAGEREF_RECLAIM;
+}
+/*
+ * Free every page on @free_pages. Pages are unlinked from the list one
+ * by one and gathered in an on-stack pagevec, so that they are handed to
+ * __pagevec_free() in batches rather than individually. The list is
+ * empty when this returns.
+ */
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+ struct pagevec freed_pvec;
+ struct page *page, *tmp;
+
+ pagevec_init(&freed_pvec, 1);
+
+ list_for_each_entry_safe(page, tmp, free_pages, lru) {
+ list_del(&page->lru);
+ if (!pagevec_add(&freed_pvec, page)) { /* 0 is presumably "no room left": flush the batch and start over */
+ __pagevec_free(&freed_pvec);
+ pagevec_reinit(&freed_pvec);
+ }
+ }
+
+ pagevec_free(&freed_pvec); /* release whatever is left in the last, partial batch */
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct scan_control *sc,
- enum pageout_io sync_writeback)
+ struct zone *zone,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
+ LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
- unsigned long vm_flags;
cond_resched();
- pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
+ enum page_references references;
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- int referenced;
cond_resched();
@@ -610,6 +730,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
@@ -635,23 +756,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* for any page for which writeback has already
* started.
*/
- if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+ if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
+ may_enter_fs)
wait_on_page_writeback(page);
- else
- goto keep_locked;
+ else {
+ unlock_page(page);
+ goto keep_lumpy;
+ }
}
- referenced = page_referenced(page, 1,
- sc->mem_cgroup, &vm_flags);
- /*
- * In active use or really unfreeable? Activate it.
- * If page which have PG_mlocked lost isoltation race,
- * try_to_unmap moves it to unevictable list
- */
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
- referenced && page_mapping_inuse(page)
- && !(vm_flags & VM_LOCKED))
+ references = page_check_references(page, sc);
+ switch (references) {
+ case PAGEREF_ACTIVATE:
goto activate_locked;
+ case PAGEREF_KEEP:
+ goto keep_locked;
+ case PAGEREF_RECLAIM:
+ case PAGEREF_RECLAIM_CLEAN:
+ ; /* try to reclaim the page below */
+ }
/*
* Anonymous process memory has backing store?
@@ -685,7 +808,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
+ nr_dirty++;
+
+ if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
@@ -693,14 +818,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Page is dirty, try to write it out here */
- switch (pageout(page, mapping, sync_writeback)) {
+ switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
+ nr_congested++;
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page))
+ if (PageWriteback(page))
+ goto keep_lumpy;
+ if (PageDirty(page))
goto keep;
+
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
@@ -770,10 +899,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
__clear_page_locked(page);
free_it:
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page)) {
- __pagevec_free(&freed_pvec);
- pagevec_reinit(&freed_pvec);
- }
+
+ /*
+ * Is there need to periodically free_page_list? It would
+ * appear not as the counts should be low
+ */
+ list_add(&page->lru, &free_pages);
continue;
cull_mlocked:
@@ -781,6 +912,7 @@ cull_mlocked:
try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
+ reset_reclaim_mode(sc);
continue;
activate_locked:
@@ -793,21 +925,28 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
+ reset_reclaim_mode(sc);
+keep_lumpy:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+
+ /*
+ * Tag a zone as congested if all the dirty pages encountered were
+ * backed by a congested BDI. In this case, reclaimers should just
+ * back off and wait for congestion to clear because further reclaim
+ * will encounter the same problem
+ */
+ if (nr_dirty == nr_congested && nr_dirty != 0)
+ zone_set_flag(zone, ZONE_CONGESTED);
+
+ free_page_list(&free_pages);
+
list_splice(&ret_pages, page_list);
- if (pagevec_count(&freed_pvec))
- __pagevec_free(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
-/* LRU Isolation modes. */
-#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
-#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
-#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
-
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
@@ -885,6 +1024,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long *scanned, int order, int mode, int file)
{
unsigned long nr_taken = 0;
+ unsigned long nr_lumpy_taken = 0;
+ unsigned long nr_lumpy_dirty = 0;
+ unsigned long nr_lumpy_failed = 0;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -903,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
@@ -947,7 +1089,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
- continue;
+ break;
/*
* If we don't have enough swap space, reclaiming of
@@ -955,19 +1097,37 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* pointless.
*/
if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
- !PageSwapCache(cursor_page))
- continue;
+ !PageSwapCache(cursor_page))
+ break;
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
- nr_taken++;
+ /*
+ * Account for the page that was just isolated
+ * (cursor_page), not for the page that started this
+ * lumpy scan; the two can differ in size.
+ */
+ nr_taken += hpage_nr_pages(cursor_page);
+ nr_lumpy_taken++;
+ if (PageDirty(cursor_page))
+ nr_lumpy_dirty++;
scan++;
+ } else {
+ /* the page is freed already. */
+ if (!page_count(cursor_page))
+ continue;
+ break;
}
}
+
+ /* If we break out of the loop above, lumpy reclaim failed */
+ if (pfn < end_pfn)
+ nr_lumpy_failed++;
}
*scanned = scan;
+
+ trace_mm_vmscan_lru_isolate(order,
+ nr_to_scan, scan,
+ nr_taken,
+ nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+ mode);
return nr_taken;
}
@@ -975,7 +1135,6 @@ static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
int mode, struct zone *z,
- struct mem_cgroup *mem_cont,
int active, int file)
{
int lru = LRU_BASE;
@@ -999,13 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
struct page *page;
list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
- nr_active++;
+ nr_active += numpages;
}
- count[lru]++;
+ if (count)
+ count[lru] += numpages;
}
return nr_active;
@@ -1082,176 +1243,210 @@ static int too_many_isolated(struct zone *zone, int file,
}
/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
*/
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc,
- int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+ unsigned long nr_anon, unsigned long nr_file,
+ struct list_head *page_list)
{
- LIST_HEAD(page_list);
+ struct page *page;
struct pagevec pvec;
- unsigned long nr_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- int lumpy_reclaim = 0;
-
- while (unlikely(too_many_isolated(zone, file, sc))) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- /* We are about to die and free our memory. Return now. */
- if (fatal_signal_pending(current))
- return SWAP_CLUSTER_MAX;
- }
+ pagevec_init(&pvec, 1);
/*
- * If we need a large contiguous chunk of memory, or have
- * trouble getting a small set of contiguous pages, we
- * will reclaim both active and inactive pages.
- *
- * We use the same threshold as pageout congestion_wait below.
+ * Put back any unfreeable pages.
*/
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- lumpy_reclaim = 1;
- else if (sc->order && priority < DEF_PRIORITY - 2)
- lumpy_reclaim = 1;
-
- pagevec_init(&pvec, 1);
+ spin_lock(&zone->lru_lock);
+ while (!list_empty(page_list)) {
+ int lru;
+ page = lru_to_page(page_list);
+ VM_BUG_ON(PageLRU(page));
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page, NULL))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
+ }
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(zone, page, lru);
+ if (is_active_lru(lru)) {
+ int file = is_file_lru(lru);
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
+ }
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- do {
- struct page *page;
- unsigned long nr_taken;
- unsigned long nr_scan;
- unsigned long nr_freed;
- unsigned long nr_active;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
- unsigned long nr_anon;
- unsigned long nr_file;
-
- nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan, sc->order, mode,
- zone, sc->mem_cgroup, 0, file);
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+}
- if (scanning_global_lru(sc)) {
- zone->pages_scanned += nr_scan;
- if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scan);
- else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scan);
- }
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+ struct scan_control *sc,
+ unsigned long *nr_anon,
+ unsigned long *nr_file,
+ struct list_head *isolated_list)
+{
+ unsigned long nr_active;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- if (nr_taken == 0)
- goto done;
+ nr_active = clear_active_flags(isolated_list, count);
+ __count_vm_events(PGDEACTIVATE, nr_active);
+
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+ -count[LRU_ACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+ -count[LRU_INACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+ -count[LRU_ACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+ -count[LRU_INACTIVE_ANON]);
+
+ *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
+
+ reclaim_stat->recent_scanned[0] += *nr_anon;
+ reclaim_stat->recent_scanned[1] += *nr_file;
+}
- nr_active = clear_active_flags(&page_list, count);
- __count_vm_events(PGDEACTIVATE, nr_active);
+/*
+ * Returns true if the caller should wait to clean dirty/writeback pages.
+ *
+ * If we are direct reclaiming for contiguous pages and we do not reclaim
+ * everything in the list, try again and wait for writeback IO to complete.
+ * This will stall high-order allocations noticeably. Only do that when we really
+ * need to free the pages under high memory pressure.
+ */
+static inline bool should_reclaim_stall(unsigned long nr_taken,
+ unsigned long nr_freed,
+ int priority,
+ struct scan_control *sc)
+{
+ int lumpy_stall_priority;
- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
+ /* kswapd should not stall on sync IO */
+ if (current_is_kswapd())
+ return false;
- nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+ /* Only stall on lumpy reclaim */
+ if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
+ return false;
- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ /* If we have reclaimed everything on the isolated list, no stall */
+ if (nr_freed == nr_taken)
+ return false;
- spin_unlock_irq(&zone->lru_lock);
+ /*
+ * For high-order allocations, there are two stall thresholds.
+ * High-cost allocations stall immediately whereas lower
+ * order allocations such as stacks require the scanning
+ * priority to be much higher before stalling.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ lumpy_stall_priority = DEF_PRIORITY;
+ else
+ lumpy_stall_priority = DEF_PRIORITY / 3;
- nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ return priority <= lumpy_stall_priority;
+}
- /*
- * If we are direct reclaiming for contiguous pages and we do
- * not reclaim everything in the list, try again and wait
- * for IO to complete. This will stall high-order allocations
- * but that should be acceptable to the caller
- */
- if (nr_freed < nr_taken && !current_is_kswapd() &&
- lumpy_reclaim) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+ struct scan_control *sc, int priority, int file)
+{
+ LIST_HEAD(page_list);
+ unsigned long nr_scanned;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_taken;
+ unsigned long nr_anon;
+ unsigned long nr_file;
- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, count);
- count_vm_events(PGDEACTIVATE, nr_active);
+ while (unlikely(too_many_isolated(zone, file, sc))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
- nr_freed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC);
- }
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }
- nr_reclaimed += nr_freed;
+ set_reclaim_mode(priority, sc, false);
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
- local_irq_disable();
+ if (scanning_global_lru(sc)) {
+ nr_taken = isolate_pages_global(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, 0, file);
+ zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
- __count_vm_events(KSWAPD_STEAL, nr_freed);
- __count_zone_vm_events(PGSTEAL, zone, nr_freed);
-
- spin_lock(&zone->lru_lock);
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone,
+ nr_scanned);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone,
+ nr_scanned);
+ } else {
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, sc->mem_cgroup,
+ 0, file);
/*
- * Put back any unfreeable pages.
+ * mem_cgroup_isolate_pages() keeps track of
+ * scanned pages on its own.
*/
- while (!list_empty(&page_list)) {
- int lru;
- page = lru_to_page(&page_list);
- VM_BUG_ON(PageLRU(page));
- list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
- spin_unlock_irq(&zone->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
- continue;
- }
- SetPageLRU(page);
- lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
- }
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+ }
+
+ if (nr_taken == 0) {
+ spin_unlock_irq(&zone->lru_lock);
+ return 0;
+ }
- } while (nr_scanned < max_scan);
+ update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
-done:
spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
- return nr_reclaimed;
-}
-/*
- * We are about to scan this zone at a certain priority level. If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone. This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
- if (priority < zone->prev_priority)
- zone->prev_priority = priority;
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+
+ /* Check if we should synchronously wait for writeback */
+ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
+ set_reclaim_mode(priority, sc, true);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+ }
+
+ local_irq_disable();
+ if (current_is_kswapd())
+ __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
+
+ putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+
+ trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
+ zone_idx(zone),
+ nr_scanned, nr_reclaimed,
+ priority,
+ trace_shrink_flags(file, sc->reclaim_mode));
+ return nr_reclaimed;
}
/*
@@ -1290,7 +1485,7 @@ static void move_active_pages_to_lru(struct zone *zone,
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
+ pgmoved += hpage_nr_pages(page);
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1320,16 +1515,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
- sc->mem_cgroup, 1, file);
- /*
- * zone->pages_scanned is used for detect zone's oom
- * mem_cgroup remembers nr_scan by itself.
- */
if (scanning_global_lru(sc)) {
+ nr_taken = isolate_pages_global(nr_pages, &l_hold,
+ &pgscanned, sc->order,
+ ISOLATE_ACTIVE, zone,
+ 1, file);
zone->pages_scanned += pgscanned;
+ } else {
+ nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
+ &pgscanned, sc->order,
+ ISOLATE_ACTIVE, zone,
+ sc->mem_cgroup, 1, file);
+ /*
+ * mem_cgroup_isolate_pages() keeps track of
+ * scanned pages on its own.
+ */
}
+
reclaim_stat->recent_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
@@ -1350,10 +1552,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
continue;
}
- /* page_referenced clears PageReferenced */
- if (page_mapping_inuse(page) &&
- page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- nr_rotated++;
+ if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+ nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -1393,6 +1593,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
}
+#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
@@ -1418,12 +1619,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
{
int low;
+ /*
+ * If we don't have swap space, anonymous page deactivation
+ * is pointless.
+ */
+ if (!total_swap_pages)
+ return 0;
+
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
return low;
}
+#else
+static inline int inactive_anon_is_low(struct zone *zone,
+ struct scan_control *sc)
+{
+ return 0;
+}
+#endif
static int inactive_file_is_low_global(struct zone *zone)
{
@@ -1485,21 +1700,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
}
/*
+ * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
+ * until we collected @swap_cluster_max pages to scan.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+ unsigned long *nr_saved_scan)
+{
+ unsigned long nr;
+
+ *nr_saved_scan += nr_to_scan;
+ nr = *nr_saved_scan;
+
+ if (nr >= SWAP_CLUSTER_MAX)
+ *nr_saved_scan = 0;
+ else
+ nr = 0;
+
+ return nr;
+}
+
+/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
- * percent[0] specifies how much pressure to put on ram/swap backed
- * memory, while percent[1] determines pressure on the file LRUs.
+ * nr[0] = anon pages to scan; nr[1] = file pages to scan
*/
-static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
- unsigned long *percent)
+static void get_scan_count(struct zone *zone, struct scan_control *sc,
+ unsigned long *nr, int priority)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ u64 fraction[2], denominator;
+ enum lru_list l;
+ int noswap = 0;
+
+ /* If we have no swap space, do not bother scanning anon pages. */
+ if (!sc->may_swap || (nr_swap_pages <= 0)) {
+ noswap = 1;
+ fraction[0] = 0;
+ fraction[1] = 1;
+ denominator = 1;
+ goto out;
+ }
anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1511,13 +1757,21 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
/* If we have very few page cache pages,
force-scan anon pages. */
if (unlikely(file + free <= high_wmark_pages(zone))) {
- percent[0] = 100;
- percent[1] = 0;
- return;
+ fraction[0] = 1;
+ fraction[1] = 0;
+ denominator = 1;
+ goto out;
}
}
/*
+ * With swappiness at 100, anonymous and file have the same priority.
+ * This scanning priority is essentially the inverse of IO cost.
+ */
+ anon_prio = sc->swappiness;
+ file_prio = 200 - sc->swappiness;
+
+ /*
* OK, so we have swap space and a fair amount of page cache
* pages. We use the recently rotated / recently scanned
* ratios to determine how valuable each cache is.
@@ -1528,28 +1782,18 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+ spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[1] /= 2;
reclaim_stat->recent_rotated[1] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
/*
- * With swappiness at 100, anonymous and file have the same priority.
- * This scanning priority is essentially the inverse of IO cost.
- */
- anon_prio = sc->swappiness;
- file_prio = 200 - sc->swappiness;
-
- /*
* The amount of pressure on anon vs file pages is inversely
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
@@ -1559,30 +1803,75 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
+ spin_unlock_irq(&zone->lru_lock);
+
+ fraction[0] = ap;
+ fraction[1] = fp;
+ denominator = ap + fp + 1;
+out:
+ for_each_evictable_lru(l) {
+ int file = is_file_lru(l);
+ unsigned long scan;
- /* Normalize to percentages */
- percent[0] = 100 * ap / (ap + fp + 1);
- percent[1] = 100 - percent[0];
+ scan = zone_nr_lru_pages(zone, sc, l);
+ if (priority || noswap) {
+ scan >>= priority;
+ scan = div64_u64(scan * fraction[file], denominator);
+ }
+ nr[l] = nr_scan_try_batch(scan,
+ &reclaim_stat->nr_saved_scan[l]);
+ }
}
/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful
*/
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
- unsigned long *nr_saved_scan)
+static inline bool should_continue_reclaim(struct zone *zone,
+ unsigned long nr_reclaimed,
+ unsigned long nr_scanned,
+ struct scan_control *sc)
{
- unsigned long nr;
+ unsigned long pages_for_compaction;
+ unsigned long inactive_lru_pages;
- *nr_saved_scan += nr_to_scan;
- nr = *nr_saved_scan;
+ /* If not in reclaim/compaction mode, stop */
+ if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+ return false;
- if (nr >= SWAP_CLUSTER_MAX)
- *nr_saved_scan = 0;
- else
- nr = 0;
+ /*
+ * If we failed to reclaim and have scanned the full list, stop.
+ * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+ * faster but obviously would be less likely to succeed
+ * allocation. If this is desirable, use GFP_REPEAT to decide
+ * if both reclaimed and scanned should be checked or just
+ * reclaimed
+ */
+ if (!nr_reclaimed && !nr_scanned)
+ return false;
- return nr;
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = (2UL << sc->order);
+ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ if (sc->nr_reclaimed < pages_for_compaction &&
+ inactive_lru_pages > pages_for_compaction)
+ return true;
+
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ switch (compaction_suitable(zone, sc->order)) {
+ case COMPACT_PARTIAL:
+ case COMPACT_CONTINUE:
+ return false;
+ default:
+ return true;
+ }
}
/*
@@ -1593,33 +1882,14 @@ static void shrink_zone(int priority, struct zone *zone,
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
- unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long nr_reclaimed;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- int noswap = 0;
-
- /* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (nr_swap_pages <= 0)) {
- noswap = 1;
- percent[0] = 0;
- percent[1] = 100;
- } else
- get_scan_ratio(zone, sc, percent);
-
- for_each_evictable_lru(l) {
- int file = is_file_lru(l);
- unsigned long scan;
+ unsigned long nr_scanned = sc->nr_scanned;
- scan = zone_nr_lru_pages(zone, sc, l);
- if (priority || noswap) {
- scan >>= priority;
- scan = (scan * percent[file]) / 100;
- }
- nr[l] = nr_scan_try_batch(scan,
- &reclaim_stat->nr_saved_scan[l]);
- }
+restart:
+ nr_reclaimed = 0;
+ get_scan_count(zone, sc, nr, priority);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
@@ -1644,16 +1914,20 @@ static void shrink_zone(int priority, struct zone *zone,
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
-
- sc->nr_reclaimed = nr_reclaimed;
+ sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
+ if (inactive_anon_is_low(zone, sc))
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+ /* reclaim/compaction might need reclaim to continue */
+ if (should_continue_reclaim(zone, nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc))
+ goto restart;
+
throttle_vm_writeout(sc->gfp_mask);
}
@@ -1676,13 +1950,11 @@ static void shrink_zone(int priority, struct zone *zone,
static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;
struct zone *zone;
- sc->all_unreclaimable = 1;
- for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
- sc->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1692,26 +1964,46 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
if (scanning_global_lru(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- note_zone_scanning_priority(zone, priority);
-
- if (zone_is_all_unreclaimable(zone) &&
- priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- sc->all_unreclaimable = 0;
- } else {
- /*
- * Ignore cpuset limitation here. We just want to reduce
- * # of used pages by us regardless of memory shortage.
- */
- sc->all_unreclaimable = 0;
- mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
- priority);
}
shrink_zone(priority, zone, sc);
}
}
+static bool zone_reclaimable(struct zone *zone)
+{
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
+/*
+ * As hibernation is going on, kswapd is frozen so that it can't mark
+ * the zone into all_unreclaimable. It can't handle OOM during hibernation.
+ * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ bool all_unreclaimable = true;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
+ if (!populated_zone(zone))
+ continue;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ if (zone_reclaimable(zone)) {
+ all_unreclaimable = false;
+ break;
+ }
+ }
+
+ return all_unreclaimable;
+}
+
/*
* This is the main entry point to direct page reclaim.
*
@@ -1732,31 +2024,17 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
- unsigned long ret = 0;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- unsigned long lru_pages = 0;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold;
+ get_mems_allowed();
delayacct_freepages_start();
if (scanning_global_lru(sc))
count_vm_event(ALLOCSTALL);
- /*
- * mem_cgroup will not do shrink_slab.
- */
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- }
- }
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
@@ -1768,6 +2046,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
* over limit cgroups
*/
if (scanning_global_lru(sc)) {
+ unsigned long lru_pages = 0;
+ for_each_zone_zonelist(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ }
+
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1775,10 +2062,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
}
}
total_scanned += sc->nr_scanned;
- if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
- ret = sc->nr_reclaimed;
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
- }
/*
* Try to write back as many pages as we just scanned. This
@@ -1795,42 +2080,33 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
- priority < DEF_PRIORITY - 2)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
- /* top priority shrink_zones still had more to do? don't OOM, then */
- if (!sc->all_unreclaimable && scanning_global_lru(sc))
- ret = sc->nr_reclaimed;
-out:
- /*
- * Now that we've scanned all the zones at this priority level, note
- * that level within the zone so that the next thread which performs
- * scanning of this zone will immediately start out at this priority
- * level. This affects only the decision whether or not to bring
- * mapped pages onto the inactive list.
- */
- if (priority < 0)
- priority = 0;
-
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
+ priority < DEF_PRIORITY - 2) {
+ struct zone *preferred_zone;
- zone->prev_priority = priority;
+ first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+ NULL, &preferred_zone);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
- } else
- mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+ }
+out:
delayacct_freepages_end();
+ put_mems_allowed();
- return ret;
+ if (sc->nr_reclaimed)
+ return sc->nr_reclaimed;
+
+ /* top priority shrink_zones still had more to do? don't OOM, then */
+ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
+ return 1;
+
+ return 0;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
@@ -1840,11 +2116,18 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
- .isolate_pages = isolate_pages_global,
.nodemask = nodemask,
};
- return do_try_to_free_pages(zonelist, &sc);
+ trace_mm_vmscan_direct_reclaim_begin(order,
+ sc.may_writepage,
+ gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1852,24 +2135,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
- struct zone *zone, int nid)
+ struct zone *zone)
{
struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
.swappiness = swappiness,
.order = 0,
.mem_cgroup = mem,
- .isolate_pages = mem_cgroup_isolate_pages,
};
- nodemask_t nm = nodemask_of_node(nid);
-
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- sc.nodemask = &nm;
- sc.nr_reclaimed = 0;
- sc.nr_scanned = 0;
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -1878,6 +2161,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed;
}
@@ -1887,6 +2173,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
unsigned int swappiness)
{
struct zonelist *zonelist;
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -1895,49 +2182,106 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.swappiness = swappiness,
.order = 0,
.mem_cgroup = mem_cont,
- .isolate_pages = mem_cgroup_isolate_pages,
.nodemask = NULL, /* we don't care the placement */
};
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
zonelist = NODE_DATA(numa_node_id())->node_zonelists;
- return do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#endif
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the caller's classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ * o a 16M DMA zone that is balanced will not balance a zone on any
+ * reasonable sized machine
+ * o On all other machines, the top zone must be at least a reasonable
+ * percentage of the middle zones. For example, on 32-bit x86, highmem
+ * would need to be at least 256M for it to balance a whole node.
+ * Similarly, on x86-64 the Normal zone would need to be at least 1G
+ * to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+ int classzone_idx)
+{
+ unsigned long present_pages = 0;
+ int i;
+
+ for (i = 0; i <= classzone_idx; i++)
+ present_pages += pgdat->node_zones[i].present_pages;
+
+ return balanced_pages > (present_pages >> 2);
+}
+
/* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+ int classzone_idx)
{
int i;
+ unsigned long balanced = 0;
+ bool all_zones_ok = true;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return 1;
+ return true;
- /* If after HZ/10, a zone is below the high mark, it's premature */
+ /* Check the watermark levels */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (zone_is_all_unreclaimable(zone))
+ /*
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well if kswapd
+ * is to sleep
+ */
+ if (zone->all_unreclaimable) {
+ balanced += zone->present_pages;
continue;
+ }
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
- 0, 0))
- return 1;
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+ classzone_idx, 0))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
}
- return 0;
+ /*
+ * For high-order requests, the balanced zones must contain at least
+ * 25% of the node's pages for kswapd to sleep. For order-0, all zones
+ * must be balanced
+ */
+ if (order)
+ return pgdat_balanced(pgdat, balanced, classzone_idx);
+ else
+ return !all_zones_ok;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -1954,11 +2298,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+ int *classzone_idx)
{
int all_zones_ok;
+ unsigned long balanced;
int priority;
int i;
+ int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
@@ -1973,26 +2320,14 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
- .isolate_pages = isolate_pages_global,
};
- /*
- * temp_priority is used to remember the scanning priority at which
- * this zone was successfully refilled to
- * free_pages == high_wmark_pages(zone).
- */
- int temp_priority[MAX_NR_ZONES];
-
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++)
- temp_priority[i] = DEF_PRIORITY;
-
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
@@ -2001,6 +2336,7 @@ loop_again:
disable_swap_token();
all_zones_ok = 1;
+ balanced = 0;
/*
* Scan in the highmem->dma direction for the highest
@@ -2012,8 +2348,7 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone_is_all_unreclaimable(zone) &&
- priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
/*
@@ -2024,9 +2359,10 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
+ *classzone_idx = i;
break;
}
}
@@ -2049,37 +2385,29 @@ loop_again:
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
+ int compaction;
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- int nid, zid;
if (!populated_zone(zone))
continue;
- if (zone_is_all_unreclaimable(zone) &&
- priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- if (!zone_watermark_ok(zone, order,
- high_wmark_pages(zone), end_zone, 0))
- all_zones_ok = 0;
- temp_priority[i] = priority;
sc.nr_scanned = 0;
- note_zone_scanning_priority(zone, priority);
- nid = pgdat->node_id;
- zid = zone_idx(zone);
/*
* Call soft limit reclaim before calling shrink_zone.
* For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
- nid, zid);
+ mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
/*
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2087,12 +2415,27 @@ loop_again:
lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
- if (zone_is_all_unreclaimable(zone))
+
+ compaction = 0;
+ if (order &&
+ zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone),
+ end_zone, 0) &&
+ !zone_watermark_ok(zone, order,
+ high_wmark_pages(zone),
+ end_zone, 0)) {
+ compact_zone_order(zone,
+ order,
+ sc.gfp_mask, false,
+ COMPACT_MODE_KSWAPD);
+ compaction = 1;
+ }
+
+ if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && zone->pages_scanned >=
- (zone_reclaimable_pages(zone) * 6))
- zone_set_flag(zone,
- ZONE_ALL_UNRECLAIMABLE);
+ if (!compaction && nr_slab == 0 &&
+ !zone_reclaimable(zone))
+ zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -2102,16 +2445,32 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- /*
- * We are still under min water mark. it mean we have
- * GFP_ATOMIC allocation failure risk. Hurry up!
- */
- if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
- end_zone, 0))
- has_under_min_watermark_zone = 1;
+ if (!zone_watermark_ok_safe(zone, order,
+ high_wmark_pages(zone), end_zone, 0)) {
+ all_zones_ok = 0;
+ /*
+ * We are still under min water mark. This
+ * means that we have a GFP_ATOMIC allocation
+ * failure risk. Hurry up!
+ */
+ if (!zone_watermark_ok_safe(zone, order,
+ min_wmark_pages(zone), end_zone, 0))
+ has_under_min_watermark_zone = 1;
+ } else {
+ /*
+ * If a zone reaches its high watermark,
+ * consider it to be no longer congested. It's
+ * possible there are dirty pages backed by
+ * congested BDIs but as pressure is relieved,
+ * speculatively avoid congestion waits
+ */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
+ }
}
- if (all_zones_ok)
+ if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2134,17 +2493,13 @@ loop_again:
break;
}
out:
+
/*
- * Note within each zone the priority level at which this zone was
- * brought into a happy state. So that the next thread which scans this
- * zone will start out at that priority level.
+ * order-0: All zones must meet high watermark for a balanced node
+ * high-order: Balanced zones must make up at least 25% of the node
+ * for the node to be balanced
*/
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->prev_priority = temp_priority[i];
- }
- if (!all_zones_ok) {
+ if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
cond_resched();
try_to_freeze();
@@ -2169,7 +2524,88 @@ out:
goto loop_again;
}
- return sc.nr_reclaimed;
+ /*
+ * If kswapd was reclaiming at a higher order, it has the option of
+ * sleeping without all zones being balanced. Before it does, it must
+ * ensure that the watermarks for order-0 on *all* zones are met and
+ * that the congestion flags are cleared. The congestion flag must
+ * be cleared as kswapd is the only mechanism that clears the flag
+ * and it is potentially going to sleep here.
+ */
+ if (order) {
+ for (i = 0; i <= end_zone; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue;
+
+ /* Confirm the zone is balanced for order-0 */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone), 0, 0)) {
+ order = sc.order = 0;
+ goto loop_again;
+ }
+
+ /* If balanced, clear the congested flag */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ }
+ }
+
+ /*
+ * Return the order we were reclaiming at so sleeping_prematurely()
+ * makes a decision on the order we were last reclaiming at. However,
+ * if another caller entered the allocator slow path while kswapd
+ * was awake, order will remain at the higher level
+ */
+ *classzone_idx = end_zone;
+ return order;
+}
+
+/*
+ * Put kswapd to sleep on pgdat->kswapd_wait, in two stages.
+ *
+ * Nothing is done if the task is freezing or has been asked to stop.
+ * Otherwise kswapd first takes a short HZ/10 timed sleep, provided
+ * sleeping_prematurely() does not object. If the check still passes after
+ * that nap, it sleeps without a timeout until woken; if not, one of the
+ * KSWAPD_*_WMARK_HIT_QUICKLY events is counted and the function returns
+ * without the long sleep.
+ *
+ * @order and @classzone_idx are handed unchanged to sleeping_prematurely().
+ */
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ long remaining = 0;
+ DEFINE_WAIT(wait);
+
+ if (freezing(current) || kthread_should_stop())
+ return;
+
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+ /* Try to sleep for a short interval */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ remaining = schedule_timeout(HZ/10);
+ finish_wait(&pgdat->kswapd_wait, &wait);
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ }
+
+ /*
+ * After a short sleep, check if it was a premature sleep. If not, then
+ * go fully to sleep until explicitly woken up.
+ */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+ /*
+ * vmstat counters are not perfectly accurate and the estimated
+ * value for counters such as NR_FREE_PAGES can deviate from the
+ * true value by nr_online_cpus * threshold. To avoid the zone
+ * watermarks being breached while under pressure, we reduce the
+ * per-cpu vmstat threshold while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+ schedule();
+ set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+ } else {
+ /* non-zero remaining: the short sleep was cut short by a wakeup */
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+ else
+ count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+ }
+ finish_wait(&pgdat->kswapd_wait, &wait);
}
/*
@@ -2188,9 +2624,10 @@ out:
static int kswapd(void *p)
{
unsigned long order;
+ int classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
+
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
@@ -2218,48 +2655,30 @@ static int kswapd(void *p)
set_freezable();
order = 0;
+ classzone_idx = MAX_NR_ZONES - 1;
for ( ; ; ) {
unsigned long new_order;
+ int new_classzone_idx;
int ret;
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- if (order < new_order) {
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
- * allocation
+ * allocation or has tighter zone constraints
*/
order = new_order;
+ classzone_idx = new_classzone_idx;
} else {
- if (!freezing(current) && !kthread_should_stop()) {
- long remaining = 0;
-
- /* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
- remaining = schedule_timeout(HZ/10);
- finish_wait(&pgdat->kswapd_wait, &wait);
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- }
-
- /*
- * After a short sleep, check if it was a
- * premature sleep. If not, then go fully
- * to sleep until explicitly woken up
- */
- if (!sleeping_prematurely(pgdat, order, remaining))
- schedule();
- else {
- if (remaining)
- count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
- else
- count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
- }
- }
-
+ kswapd_try_to_sleep(pgdat, order, classzone_idx);
order = pgdat->kswapd_max_order;
+ classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
}
- finish_wait(&pgdat->kswapd_wait, &wait);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -2269,8 +2688,10 @@ static int kswapd(void *p)
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (!ret)
- balance_pgdat(pgdat, order);
+ if (!ret) {
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
+ order = balance_pgdat(pgdat, order, &classzone_idx);
+ }
}
return 0;
}
@@ -2278,22 +2699,26 @@ static int kswapd(void *p)
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
- return;
- if (pgdat->kswapd_max_order < order)
- pgdat->kswapd_max_order = order;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
+ if (pgdat->kswapd_max_order < order) {
+ pgdat->kswapd_max_order = order;
+ pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+ }
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -2353,7 +2778,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.hibernation_mode = 1,
.swappiness = vm_swappiness,
.order = 0,
- .isolate_pages = isolate_pages_global,
};
struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
@@ -2538,11 +2962,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
.order = order,
- .isolate_pages = isolate_pages_global,
};
- unsigned long slab_reclaimable;
+ unsigned long nr_slab_pages0, nr_slab_pages1;
- disable_swap_token();
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2550,6 +2972,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* and RECLAIM_SWAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -2560,14 +2983,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
- note_zone_scanning_priority(zone, priority);
shrink_zone(priority, zone, &sc);
priority--;
} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}
- slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (slab_reclaimable > zone->min_slab_pages) {
+ nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages0 > zone->min_slab_pages) {
/*
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
@@ -2578,21 +3000,32 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
- slab_reclaimable - nr_pages)
- ;
+ for (;;) {
+ unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+ /* No reclaimable slab or very low memory pressure */
+ if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ break;
+
+ /* Freed enough memory */
+ nr_slab_pages1 = zone_page_state(zone,
+ NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+ break;
+ }
/*
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- sc.nr_reclaimed += slab_reclaimable -
- zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 < nr_slab_pages0)
+ sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
}
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+ lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
}
@@ -2615,7 +3048,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
- if (zone_is_all_unreclaimable(zone))
+ if (zone->all_unreclaimable)
return ZONE_RECLAIM_FULL;
/*
@@ -2846,6 +3279,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
return 0;
}
+#ifdef CONFIG_NUMA
/*
* per node 'scan_unevictable_pages' attribute. On demand re-scan of
* a specified node's per zone unevictable lists for evictable pages.
@@ -2892,4 +3326,4 @@ void scan_unevictable_unregister_node(struct node *node)
{
sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
}
-
+#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6051fbab67ba..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,22 +12,26 @@
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/writeback.h>
+#include <linux/compaction.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
+static void sum_vm_events(unsigned long *ret)
{
int cpu;
int i;
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
- for_each_cpu(cpu, cpumask) {
+ for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -43,7 +47,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret, cpu_online_mask);
+ sum_vm_events(ret);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -79,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-static int calculate_threshold(struct zone *zone)
+/*
+ * Compute the reduced per-cpu vmstat threshold to use for @zone while it is
+ * under pressure.
+ *
+ * vmstat counters are estimates, so the real value can drift from the
+ * reported one. With a high threshold and many CPUs the min watermark could
+ * be breached while the estimate still looks fine. Dividing the gap between
+ * the low and min watermarks by the number of online CPUs gives a threshold
+ * for which even the maximum drift does not accidentally breach the min
+ * watermark.
+ *
+ * Returns a threshold clamped to the range 1..125.
+ */
+int calculate_pressure_threshold(struct zone *zone)
+{
+	int watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	int threshold = (int)(watermark_distance / num_online_cpus());
+
+	/* Never hand out a threshold below 1 */
+	if (threshold < 1)
+		threshold = 1;
+
+	/* Maximum threshold is 125 */
+	if (threshold > 125)
+		threshold = 125;
+
+	return threshold;
+}
+
+int calculate_normal_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
@@ -136,10 +164,44 @@ static void refresh_zone_stat_thresholds(void)
int threshold;
for_each_populated_zone(zone) {
- threshold = calculate_threshold(zone);
+ unsigned long max_drift, tolerate_drift;
+
+ threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu)
- zone_pcp(zone, cpu)->stat_threshold = threshold;
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+
+ /*
+ * Only set percpu_drift_mark if there is a danger that
+ * NR_FREE_PAGES reports the low watermark is ok when in fact
+ * the min watermark could be breached by an allocation
+ */
+ tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+ max_drift = num_online_cpus() * threshold;
+ if (max_drift > tolerate_drift)
+ zone->percpu_drift_mark = high_wmark_pages(zone) +
+ max_drift;
+ }
+}
+
+/*
+ * Install a new per-cpu stat_threshold on the zones of @pgdat.
+ *
+ * @calculate_pressure is called once per zone to obtain the threshold, which
+ * is then written to that zone's pageset for every possible CPU. Zones whose
+ * percpu_drift_mark is zero are left untouched.
+ */
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+				int (*calculate_pressure)(struct zone *))
+{
+	int zid;
+
+	for (zid = 0; zid < pgdat->nr_zones; zid++) {
+		struct zone *zone = pgdat->node_zones + zid;
+		int new_threshold;
+		int cpu;
+
+		/* No drift mark set for this zone: keep its threshold */
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		new_threshold = calculate_pressure(zone);
+		for_each_possible_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold =
+								new_threshold;
}
}
@@ -149,35 +211,24 @@ static void refresh_zone_stat_thresholds(void)
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
+ long t;
- x = delta + *p;
+ x = delta + __this_cpu_read(*p);
- if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+ t = __this_cpu_read(pcp->stat_threshold);
+
+ if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
- *p = x;
+ __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
/*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_zone_page_state(zone, item, delta);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
* Optimized increment and decrement functions.
*
* These are only for a single page and therefore can take a struct page *
@@ -202,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- (*p)++;
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
- if (unlikely(*p > pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
-
- zone_page_state_add(*p + overstep, zone, item);
- *p = -overstep;
+ zone_page_state_add(v + overstep, zone, item);
+ __this_cpu_write(*p, -overstep);
}
}
@@ -223,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
-
- (*p)--;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- if (unlikely(*p < - pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
- zone_page_state_add(*p - overstep, zone, item);
- *p = overstep;
+ zone_page_state_add(v - overstep, zone, item);
+ __this_cpu_write(*p, overstep);
}
}
@@ -242,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(__dec_zone_page_state);
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations: @delta is added to the per-cpu differential for @item and,
+ * once the differential exceeds the per-cpu threshold, it is folded into
+ * the zone counter with zone_page_state_add().
+ *
+ * Overstep mode specifies how overstep should be handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+ enum zone_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong cpu if we get
+ * rescheduled while executing here. However, the following
+ * will apply the threshold again and therefore bring the
+ * counter under the threshold.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (n > t || n < -t) {
+ int os = overstep_mode * (t >> 1) ;
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ /* retry if the per-cpu value no longer matches the one read as o */
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
+
+/* Add @delta to @item of @zone via mod_state(), overstep mode 0 (none). */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+/* Increment @item of @zone by one via mod_state(), overstep mode 1. */
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ mod_state(zone, item, 1, 1);
+}
+
+/* Increment @item by one on the zone that @page belongs to (overstep mode 1). */
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+/* Decrement @item by one on the zone that @page belongs to (overstep mode -1). */
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Fallback for builds without CONFIG_CMPXCHG_LOCAL: use interrupt disable
+ * to serialize counter updates around __mod_zone_page_state().
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
@@ -272,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
+#endif
/*
* Update the zone counters for one cpu.
@@ -300,7 +440,7 @@ void refresh_cpu_vm_stats(int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
- p = zone_pcp(zone, cpu);
+ p = per_cpu_ptr(zone->pageset, cpu);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (p->vm_stat_diff[i]) {
@@ -376,7 +516,87 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
}
#endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_COMPACTION
+
+/*
+ * Free-list summary of one zone, filled in by fill_contig_page_info():
+ * free_pages is the number of free base pages, free_blocks_total the number
+ * of free blocks of any order, and free_blocks_suitable the number of
+ * blocks of the target order that the large-enough free blocks provide.
+ */
+struct contig_page_info {
+ unsigned long free_pages;
+ unsigned long free_blocks_total;
+ unsigned long free_blocks_suitable;
+};
+
+/*
+ * Walk the free lists of @zone and summarise them in @info: the number of
+ * free base pages, the number of free blocks of any order, and how many
+ * blocks of @suitable_order the free blocks of that order or larger add up
+ * to.
+ *
+ * No attempt is made to estimate how many suitable free blocks there
+ * *might* be if MOVABLE pages were migrated. Calculating that is possible,
+ * but expensive and can be figured out from userspace.
+ */
+static void fill_contig_page_info(struct zone *zone,
+				unsigned int suitable_order,
+				struct contig_page_info *info)
+{
+	unsigned long nr_pages = 0;
+	unsigned long nr_blocks = 0;
+	unsigned long nr_suitable = 0;
+	unsigned int order = 0;
+
+	while (order < MAX_ORDER) {
+		unsigned long nr_free = zone->free_area[order].nr_free;
+
+		/* Every free block counts once, whatever its order */
+		nr_blocks += nr_free;
+
+		/* A block of this order holds 1 << order base pages */
+		nr_pages += nr_free << order;
+
+		/* A large-enough block yields several suitable blocks */
+		if (order >= suitable_order)
+			nr_suitable += nr_free << (order - suitable_order);
+
+		order++;
+	}
+
+	info->free_pages = nr_pages;
+	info->free_blocks_total = nr_blocks;
+	info->free_blocks_suitable = nr_suitable;
+}
+
+/*
+ * A fragmentation index only makes sense if an allocation of a requested
+ * size would fail. If that is true, the fragmentation index indicates
+ * whether external fragmentation or a lack of memory was the problem.
+ * The value can be used to determine if page reclaim or compaction
+ * should be used.
+ *
+ * Returns 0 when @info records no free blocks at all, -1000 when a suitable
+ * block exists (the request would not fail), and otherwise the index scaled
+ * by 1000:
+ *
+ * 0 => allocation would fail due to lack of memory
+ * 1 => allocation would fail due to fragmentation
+ */
+static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
+{
+	unsigned long requested = 1UL << order;
+	unsigned long long scaled_requests;
+
+	if (!info->free_blocks_total)
+		return 0;
+
+	/* Fragmentation index only makes sense when a request would fail */
+	if (info->free_blocks_suitable)
+		return -1000;
+
+	/* Free pages expressed in requests of this order, to 3 decimal places */
+	scaled_requests = div_u64(info->free_pages * 1000ULL, requested);
+
+	return 1000 - div_u64(1000 + scaled_requests, info->free_blocks_total);
+}
+
+/*
+ * Same as __fragmentation_index() but keeps the contig_page_info on the
+ * stack and fills it from @zone for the given @order first.
+ */
+int fragmentation_index(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ return __fragmentation_index(order, &info);
+}
+#endif
+
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -429,7 +649,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
spin_unlock_irqrestore(&zone->lock, flags);
}
}
+#endif
+#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -647,6 +869,9 @@ static const char * const vmstat_text[] = {
"nr_isolated_anon",
"nr_isolated_file",
"nr_shmem",
+ "nr_dirtied",
+ "nr_written",
+
#ifdef CONFIG_NUMA
"numa_hit",
"numa_miss",
@@ -655,6 +880,9 @@ static const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "nr_anon_transparent_hugepages",
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
#ifdef CONFIG_VM_EVENT_COUNTERS
"pgpgin",
@@ -690,6 +918,16 @@ static const char * const vmstat_text[] = {
"allocstall",
"pgrotated",
+
+#ifdef CONFIG_COMPACTION
+ "compact_blocks_moved",
+ "compact_pages_moved",
+ "compact_pagemigrate_failed",
+ "compact_stall",
+ "compact_fail",
+ "compact_success",
+#endif
+
#ifdef CONFIG_HUGETLB_PAGE
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
@@ -741,7 +979,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, i);
+ pageset = per_cpu_ptr(zone->pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
@@ -758,11 +996,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n all_unreclaimable: %u"
- "\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
- zone_is_all_unreclaimable(zone),
- zone->prev_priority,
+ zone->all_unreclaimable,
zone->zone_start_pfn,
zone->inactive_ratio);
seq_putc(m, '\n');
@@ -798,36 +1034,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
.release = seq_release,
};
+/*
+ * Slots for the two dirty limits that vmstat_start() stores right after the
+ * zone stat items; NR_VM_WRITEBACK_STAT_ITEMS is the number of slots.
+ */
+enum writeback_stat_item {
+ NR_DIRTY_THRESHOLD,
+ NR_DIRTY_BG_THRESHOLD,
+ NR_VM_WRITEBACK_STAT_ITEMS,
+};
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
-#ifdef CONFIG_VM_EVENT_COUNTERS
- unsigned long *e;
-#endif
- int i;
+ int i, stat_items_size;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
+ stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+ NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
- + sizeof(struct vm_event_state), GFP_KERNEL);
-#else
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
- GFP_KERNEL);
+ stat_items_size += sizeof(struct vm_event_state);
#endif
+
+ v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
v[i] = global_page_state(i);
+ v += NR_VM_ZONE_STAT_ITEMS;
+
+ global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+ v + NR_DIRTY_THRESHOLD);
+ v += NR_VM_WRITEBACK_STAT_ITEMS;
+
#ifdef CONFIG_VM_EVENT_COUNTERS
- e = v + NR_VM_ZONE_STAT_ITEMS;
- all_vm_events(e);
- e[PGPGIN] /= 2; /* sectors -> kbytes */
- e[PGPGOUT] /= 2;
+ all_vm_events(v);
+ v[PGPGIN] /= 2; /* sectors -> kbytes */
+ v[PGPGOUT] /= 2;
#endif
- return v + *pos;
+ return (unsigned long *)m->private + *pos;
}
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -905,11 +1149,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ refresh_zone_stat_thresholds();
start_cpu_timer(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
per_cpu(vmstat_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED:
@@ -950,3 +1196,162 @@ static int __init setup_vmstat(void)
return 0;
}
module_init(setup_vmstat)
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
+#include <linux/debugfs.h>
+
+/* debugfs directory "extfrag", created by extfrag_debug_init() */
+static struct dentry *extfrag_debug_root;
+
+/*
+ * Report how much of the free memory described by @info cannot be used for
+ * an allocation of the given @order.
+ *
+ * The result is a fraction scaled by 1000, i.e. a value between 0 and 1
+ * expressed to three decimal places:
+ *
+ *    0 => free_blocks_suitable << order accounts for all free pages
+ * 1000 => none of the free pages are usable (or there are no free pages)
+ */
+static int unusable_free_index(unsigned int order,
+				struct contig_page_info *info)
+{
+	/* With no free memory at all, treat all of it as unusable */
+	if (!info->free_pages)
+		return 1000;
+
+	/* Free pages not covered by suitable blocks, as a share of all free pages */
+	return div_u64((info->free_pages -
+			(info->free_blocks_suitable << order)) * 1000ULL,
+		       info->free_pages);
+}
+
+/*
+ * Print one line for @zone: the node id and zone name, followed by the
+ * unusable free space index of every order below MAX_ORDER, each shown as a
+ * decimal with three fractional digits.
+ */
+static void unusable_show_print(struct seq_file *m,
+					pg_data_t *pgdat, struct zone *zone)
+{
+	struct contig_page_info info;
+	unsigned int order = 0;
+
+	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+
+	while (order < MAX_ORDER) {
+		int idx;
+
+		fill_contig_page_info(zone, order, &info);
+		idx = unusable_free_index(order, &info);
+		/* idx is scaled by 1000: split into integer and fraction */
+		seq_printf(m, "%d.%03d ", idx / 1000, idx % 1000);
+		++order;
+	}
+
+	seq_putc(m, '\n');
+}
+
+/*
+ * seq_file show callback for the unusable free space index.
+ *
+ * The index measures how much of the available free memory cannot be used
+ * to satisfy an allocation of a given size. It is a value between 0 and 1;
+ * the higher it is, the more free memory is unusable and, by implication,
+ * the worse the external fragmentation. Multiply by 100 to read it as a
+ * percentage.
+ *
+ * @arg is the node (pg_data_t) to report on. Always returns 0.
+ */
+static int unusable_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+
+	/* A memoryless node has nothing to report */
+	if (node_state(pgdat->node_id, N_HIGH_MEMORY))
+		walk_zones_in_node(m, pgdat, unusable_show_print);
+
+	return 0;
+}
+
+/* Iterate with the frag_* helpers; print each item with unusable_show() */
+static const struct seq_operations unusable_op = {
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= unusable_show,
+};
+
+/* open() handler: attach the unusable_op sequence operations to @file */
+static int unusable_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &unusable_op);
+}
+
+/* File operations of the "unusable_index" file; reads go through seq_file */
+static const struct file_operations unusable_file_ops = {
+	.open		= unusable_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Print one line for @zone: the node id and zone name, followed by the
+ * value of __fragmentation_index() for every order below MAX_ORDER, each
+ * shown as a decimal with three fractional digits.
+ */
+static void extfrag_show_print(struct seq_file *m,
+					pg_data_t *pgdat, struct zone *zone)
+{
+	/* Alloc on stack as interrupts are disabled for zone walk */
+	struct contig_page_info info;
+	unsigned int order = 0;
+
+	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+
+	while (order < MAX_ORDER) {
+		int idx;
+
+		fill_contig_page_info(zone, order, &info);
+		idx = __fragmentation_index(order, &info);
+		/* idx is scaled by 1000: split into integer and fraction */
+		seq_printf(m, "%d.%03d ", idx / 1000, idx % 1000);
+		++order;
+	}
+
+	seq_putc(m, '\n');
+}
+
+/*
+ * seq_file show callback for the fragmentation index (relevant for orders
+ * that allocations would fail for).
+ *
+ * @arg is the node (pg_data_t) to report on; its zones are visited through
+ * walk_zones_in_node(). Always returns 0.
+ */
+static int extfrag_show(struct seq_file *m, void *arg)
+{
+	walk_zones_in_node(m, arg, extfrag_show_print);
+	return 0;
+}
+
+/* Iterate with the frag_* helpers; print each item with extfrag_show() */
+static const struct seq_operations extfrag_op = {
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= extfrag_show,
+};
+
+/* open() handler: attach the extfrag_op sequence operations to @file */
+static int extfrag_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &extfrag_op);
+}
+
+/* File operations of the "extfrag_index" file; reads go through seq_file */
+static const struct file_operations extfrag_file_ops = {
+	.open		= extfrag_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Create the debugfs directory "extfrag" holding two read-only files,
+ * "unusable_index" and "extfrag_index".
+ *
+ * Returns 0 on success, or -ENOMEM if any of the debugfs objects could not
+ * be created. On a partial failure the directory and everything already
+ * created beneath it are removed again, so no stale entries stay behind.
+ */
+static int __init extfrag_debug_init(void)
+{
+	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
+	if (!extfrag_debug_root)
+		return -ENOMEM;
+
+	if (!debugfs_create_file("unusable_index", 0444,
+			extfrag_debug_root, NULL, &unusable_file_ops))
+		goto fail;
+
+	if (!debugfs_create_file("extfrag_index", 0444,
+			extfrag_debug_root, NULL, &extfrag_file_ops))
+		goto fail;
+
+	return 0;
+
+fail:
+	/* Undo the directory (and any file already in it) before bailing out */
+	debugfs_remove_recursive(extfrag_debug_root);
+	extfrag_debug_root = NULL;
+	return -ENOMEM;
+}
+
+module_init(extfrag_debug_init);
+#endif