summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/cma.c83
-rw-r--r--mm/compaction.c4
-rw-r--r--mm/gup.c3
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/internal.h4
-rw-r--r--mm/kasan/kasan.c66
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/migrate.c4
-rw-r--r--mm/mmap.c76
-rw-r--r--mm/oom_kill.c81
-rw-r--r--mm/page_alloc.c99
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swapfile.c7
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/vmstat.c6
-rw-r--r--mm/z3fold.c42
17 files changed, 269 insertions, 215 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5004d82a1d6..e14c01513bfd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -636,6 +636,7 @@ config DEFERRED_STRUCT_PAGE_INIT
default n
depends on NO_BOOTMEM
depends on !FLATMEM
+ depends on !NEED_PER_CPU_KM
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
diff --git a/mm/cma.c b/mm/cma.c
index aa40e6c7b042..5809bbe360d7 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -39,7 +39,6 @@
#include <trace/events/cma.h>
#include "cma.h"
-#include "internal.h"
struct cma cma_areas[MAX_CMA_AREAS];
unsigned cma_area_count;
@@ -110,25 +109,23 @@ static int __init cma_activate_area(struct cma *cma)
if (!cma->bitmap)
return -ENOMEM;
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
do {
unsigned j;
base_pfn = pfn;
- if (!pfn_valid(base_pfn))
- goto err;
-
- zone = page_zone(pfn_to_page(base_pfn));
for (j = pageblock_nr_pages; j; --j, pfn++) {
- if (!pfn_valid(pfn))
- goto err;
-
+ WARN_ON_ONCE(!pfn_valid(pfn));
/*
- * In init_cma_reserved_pageblock(), present_pages
- * is adjusted with assumption that all pages in
- * the pageblock come from a single zone.
+ * alloc_contig_range requires the pfn range
+ * specified to be in the same zone. Make this
+ * simple by forcing the entire CMA resv range
+ * to be in the same zone.
*/
if (page_zone(pfn_to_page(pfn)) != zone)
- goto err;
+ goto not_in_zone;
}
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
} while (--i);
@@ -142,7 +139,7 @@ static int __init cma_activate_area(struct cma *cma)
return 0;
-err:
+not_in_zone:
pr_err("CMA area %s could not be activated\n", cma->name);
kfree(cma->bitmap);
cma->count = 0;
@@ -152,41 +149,6 @@ err:
static int __init cma_init_reserved_areas(void)
{
int i;
- struct zone *zone;
- pg_data_t *pgdat;
-
- if (!cma_area_count)
- return 0;
-
- for_each_online_pgdat(pgdat) {
- unsigned long start_pfn = UINT_MAX, end_pfn = 0;
-
- zone = &pgdat->node_zones[ZONE_MOVABLE];
-
- /*
- * In this case, we cannot adjust the zone range
- * since it is now maximum node span and we don't
- * know original zone range.
- */
- if (populated_zone(zone))
- continue;
-
- for (i = 0; i < cma_area_count; i++) {
- if (pfn_to_nid(cma_areas[i].base_pfn) !=
- pgdat->node_id)
- continue;
-
- start_pfn = min(start_pfn, cma_areas[i].base_pfn);
- end_pfn = max(end_pfn, cma_areas[i].base_pfn +
- cma_areas[i].count);
- }
-
- if (!end_pfn)
- continue;
-
- zone->zone_start_pfn = start_pfn;
- zone->spanned_pages = end_pfn - start_pfn;
- }
for (i = 0; i < cma_area_count; i++) {
int ret = cma_activate_area(&cma_areas[i]);
@@ -195,32 +157,9 @@ static int __init cma_init_reserved_areas(void)
return ret;
}
- /*
- * Reserved pages for ZONE_MOVABLE are now activated and
- * this would change ZONE_MOVABLE's managed page counter and
- * the other zones' present counter. We need to re-calculate
- * various zone information that depends on this initialization.
- */
- build_all_zonelists(NULL);
- for_each_populated_zone(zone) {
- if (zone_idx(zone) == ZONE_MOVABLE) {
- zone_pcp_reset(zone);
- setup_zone_pageset(zone);
- } else
- zone_pcp_update(zone);
-
- set_zone_contiguous(zone);
- }
-
- /*
- * We need to re-init per zone wmark by calling
- * init_per_zone_wmark_min() but doesn't call here because it is
- * registered on core_initcall and it will be called later than us.
- */
-
return 0;
}
-pure_initcall(cma_init_reserved_areas);
+core_initcall(cma_init_reserved_areas);
/**
* cma_init_reserved_mem() - create custom contiguous area from reserved memory
diff --git a/mm/compaction.c b/mm/compaction.c
index 028b7210a669..29bd1df18b98 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1450,12 +1450,14 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
+ * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+ * suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
- 0, wmark_target))
+ ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
diff --git a/mm/gup.c b/mm/gup.c
index 76af4cfeaf68..541904a7c60f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -544,6 +544,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
+ if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ return -EFAULT;
+
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a3a1815f8e11..b9f3dbd885bd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2431,7 +2431,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
- __ClearPageDirty(head + i);
+ ClearPageDirty(head + i);
__delete_from_page_cache(head + i, NULL);
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..502d14189794 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -168,9 +168,6 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
-extern void set_zone_contiguous(struct zone *zone);
-extern void clear_zone_contiguous(struct zone *zone);
-
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -498,6 +495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index bc0e68f7dc75..f185455b3406 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -792,6 +792,40 @@ DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+ * We can't use pud_large() or pud_huge(), the first one is
+ * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
+ * pud_bad(), if pud is bad then it's bad because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
static int __meminit kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -813,6 +847,14 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
case MEM_GOING_ONLINE: {
void *ret;
+ /*
+ * If shadow is mapped already than it must have been mapped
+ * during the boot. This could happen if we onlining previously
+ * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
shadow_end, GFP_KERNEL,
PAGE_KERNEL, VM_NO_GUARD,
@@ -824,8 +866,26 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
kmemleak_ignore(ret);
return NOTIFY_OK;
}
- case MEM_OFFLINE:
- vfree((void *)shadow_start);
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * shadow_start was either mapped during boot by kasan_init()
+ * or during memory online by __vmalloc_node_range().
+ * In the latter case we can use vfree() to free shadow.
+ * Non-NULL result of the find_vm_area() will tell us if
+ * that was the second case.
+ *
+ * Currently it's not possible to free shadow mapped
+ * during boot by kasan_init(). It's because the code
+ * to do that hasn't been written yet. So we'll just
+ * leak the memory.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
}
return NOTIFY_OK;
@@ -838,5 +898,5 @@ static int __init kasan_memhotplug_init(void)
return 0;
}
-module_init(kasan_memhotplug_init);
+core_initcall(kasan_memhotplug_init);
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f74826cdceea..25982467800b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1158,7 +1158,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
* nodes have to go through register_node.
* TODO clean up this mess.
*/
- ret = link_mem_sections(nid, start_pfn, nr_pages);
+ ret = link_mem_sections(nid, start_pfn, nr_pages, false);
register_fail:
/*
* If sysfs file of new node can't create, cpu on the node
diff --git a/mm/migrate.c b/mm/migrate.c
index 568433023831..8c0af0f7cab1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -528,14 +528,12 @@ int migrate_page_move_mapping(struct address_space *mapping,
int i;
int index = page_index(page);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
pslot = radix_tree_lookup_slot(&mapping->i_pages,
index + i);
radix_tree_replace_slot(&mapping->i_pages, pslot,
newpage + i);
}
- } else {
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
}
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d5968d1e8e3..fc41c0543d7f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1324,6 +1324,35 @@ static inline int mlock_future_check(struct mm_struct *mm,
return 0;
}
+static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ if (S_ISBLK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ /* Special "we do even unsigned file positions" case */
+ if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+ return 0;
+
+ /* Yes, random drivers might want more. But I'm tired of buggy drivers */
+ return ULONG_MAX;
+}
+
+static inline bool file_mmap_ok(struct file *file, struct inode *inode,
+ unsigned long pgoff, unsigned long len)
+{
+ u64 maxsize = file_mmap_size_max(file, inode);
+
+ if (maxsize && len > maxsize)
+ return false;
+ maxsize -= len;
+ if (pgoff > maxsize >> PAGE_SHIFT)
+ return false;
+ return true;
+}
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
@@ -1409,6 +1438,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
struct inode *inode = file_inode(file);
unsigned long flags_mask;
+ if (!file_mmap_ok(file, inode, pgoff, len))
+ return -EOVERFLOW;
+
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
@@ -3024,6 +3056,32 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
+ if (unlikely(mm_is_oom_victim(mm))) {
+ /*
+ * Manually reap the mm to free as much memory as possible.
+ * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
+ * this mm from further consideration. Taking mm->mmap_sem for
+ * write after setting MMF_OOM_SKIP will guarantee that the oom
+ * reaper will not run on this mm again after mmap_sem is
+ * dropped.
+ *
+ * Nothing can be holding mm->mmap_sem here and the above call
+ * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
+ * __oom_reap_task_mm() will not block.
+ *
+ * This needs to be done before calling munlock_vma_pages_all(),
+ * which clears VM_LOCKED, otherwise the oom reaper cannot
+ * reliably test it.
+ */
+ mutex_lock(&oom_lock);
+ __oom_reap_task_mm(mm);
+ mutex_unlock(&oom_lock);
+
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@@ -3045,24 +3103,6 @@ void exit_mmap(struct mm_struct *mm)
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
-
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Wait for oom_reap_task() to stop working on this
- * mm. Because MMF_OOM_SKIP is already set before
- * calling down_read(), oom_reap_task() will not run
- * on this "mm" post up_write().
- *
- * mm_is_oom_victim() cannot be set from under us
- * either because victim->mm is already set to NULL
- * under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if victim->mm
- * is found not NULL while holding the task_lock.
- */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa8760a..8ba6cb88cf58 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}
-
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+void __oom_reap_task_mm(struct mm_struct *mm)
{
- struct mmu_gather tlb;
struct vm_area_struct *vma;
+
+ /*
+ * Tell all users of get_user/copy_from_user etc... that the content
+ * is no longer stable. No barriers really needed because unmapping
+ * should imply barriers already and the reader would hit a page fault
+ * if it stumbled over a reaped memory.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (!can_madv_dontneed_vma(vma))
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+ }
+ }
+}
+
+static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+{
bool ret = true;
/*
* We have to make sure to not race with the victim exit path
* and cause premature new oom victim selection:
- * __oom_reap_task_mm exit_mm
+ * oom_reap_task_mm exit_mm
* mmget_not_zero
* mmput
* atomic_dec_and_test
@@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
trace_start_task_reaping(tsk->pid);
- /*
- * Tell all users of get_user/copy_from_user etc... that the content
- * is no longer stable. No barriers really needed because unmapping
- * should imply barriers already and the reader would hit a page fault
- * if it stumbled over a reaped memory.
- */
- set_bit(MMF_UNSTABLE, &mm->flags);
-
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
- continue;
+ __oom_reap_task_mm(mm);
- /*
- * Only anonymous pages have a good chance to be dropped
- * without additional steps which we cannot afford as we
- * are OOM already.
- *
- * We do not even care about fs backed pages because all
- * which are reclaimable have already been reclaimed and
- * we do not want to block exit_mmap by keeping mm ref
- * count elevated without a good reason.
- */
- if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- const unsigned long start = vma->vm_start;
- const unsigned long end = vma->vm_end;
-
- tlb_gather_mmu(&tlb, mm, start, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
- unmap_page_range(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
- }
- }
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk)
struct mm_struct *mm = tsk->signal->oom_mm;
/* Retry the down_read_trylock(mmap_sem) a few times */
- while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
test_bit(MMF_OOM_SKIP, &mm->flags))
goto done;
-
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
debug_show_all_locks();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 905db9d7962f..22320ea27489 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1743,38 +1743,16 @@ void __init page_alloc_init_late(void)
}
#ifdef CONFIG_CMA
-static void __init adjust_present_page_count(struct page *page, long count)
-{
- struct zone *zone = page_zone(page);
-
- /* We don't need to hold a lock since it is boot-up process */
- zone->present_pages += count;
-}
-
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
- unsigned long pfn = page_to_pfn(page);
struct page *p = page;
- int nid = page_to_nid(page);
-
- /*
- * ZONE_MOVABLE will steal present pages from other zones by
- * changing page links so page_zone() is changed. Before that,
- * we need to adjust previous zone's page count first.
- */
- adjust_present_page_count(page, -pageblock_nr_pages);
do {
__ClearPageReserved(p);
set_page_count(p, 0);
-
- /* Steal pages from other zones */
- set_page_links(p, ZONE_MOVABLE, nid, pfn);
- } while (++p, ++pfn, --i);
-
- adjust_present_page_count(page, pageblock_nr_pages);
+ } while (++p, --i);
set_pageblock_migratetype(page, MIGRATE_CMA);
@@ -2889,7 +2867,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* exists.
*/
watermark = min_wmark_pages(zone) + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3165,6 +3143,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
@@ -3191,8 +3175,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
#ifdef CONFIG_CMA
- if (!list_empty(&area->free_list[MIGRATE_CMA]))
+ if ((alloc_flags & ALLOC_CMA) &&
+ !list_empty(&area->free_list[MIGRATE_CMA])) {
return true;
+ }
#endif
if (alloc_harder &&
!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
@@ -3212,6 +3198,13 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, unsigned int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
+ long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
/*
* Fast check for order-0 only. If this fails then the reserves
@@ -3220,7 +3213,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* the caller is !atomic then it'll uselessly search the free
* list. That corner case is then slower but it is harmless.
*/
- if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx])
+ if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
return true;
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
@@ -3856,6 +3849,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
+#ifdef CONFIG_CMA
+ if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
return alloc_flags;
}
@@ -4322,6 +4319,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
+ if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+ *alloc_flags |= ALLOC_CMA;
+
return true;
}
@@ -6204,7 +6204,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long node_end_pfn = 0;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6232,13 +6231,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long movable_size = 0;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
- if (zone_end_pfn(zone) > node_end_pfn)
- node_end_pfn = zone_end_pfn(zone);
-
/*
* Adjust freesize so that it accounts for how much memory
@@ -6287,30 +6282,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone_seqlock_init(zone);
zone_pcp_init(zone);
- /*
- * The size of the CMA area is unknown now so we need to
- * prepare the memory for the usemap at maximum.
- */
- if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE &&
- pgdat->node_spanned_pages) {
- movable_size = node_end_pfn - pgdat->node_start_pfn;
- }
-
- if (!size && !movable_size)
+ if (!size)
continue;
set_pageblock_order();
- if (movable_size) {
- zone->zone_start_pfn = pgdat->node_start_pfn;
- zone->spanned_pages = movable_size;
- setup_usemap(pgdat, zone,
- pgdat->node_start_pfn, movable_size);
- init_currently_empty_zone(zone,
- pgdat->node_start_pfn, movable_size);
- } else {
- setup_usemap(pgdat, zone, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
- }
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -7621,11 +7598,12 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unsigned long pfn, iter, found;
/*
- * For avoiding noise data, lru_add_drain_all() should be called
- * If ZONE_MOVABLE, the zone never contains unmovable pages
+ * TODO we could make this much more efficient by not checking every
+ * page in the range if we know all of them are in MOVABLE_ZONE and
+ * that the movable zone guarantees that pages are migratable but
+ * the later is not the case right now unfortunatelly. E.g. movablecore
+ * can still lead to having bootmem allocations in zone_movable.
*/
- if (zone_idx(zone) == ZONE_MOVABLE)
- return false;
/*
* CMA allocations (alloc_contig_range) really need to mark isolate
@@ -7646,7 +7624,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
page = pfn_to_page(check);
if (PageReserved(page))
- return true;
+ goto unmovable;
/*
* Hugepages are not in LRU lists, but they're movable.
@@ -7696,9 +7674,12 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* page at boot.
*/
if (found > count)
- return true;
+ goto unmovable;
}
return false;
+unmovable:
+ WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+ return true;
}
bool is_pageblock_removable_nolock(struct page *page)
@@ -7951,7 +7932,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
#endif
-#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
diff --git a/mm/sparse.c b/mm/sparse.c
index 62eef264a7bd..73dc2fcc0eab 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -629,7 +629,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ unsigned long section_nr = pfn_to_section_nr(pfn);
struct mem_section *ms;
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cc2cf04d9018..78a015fcec3b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3112,6 +3112,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
+ bool inced_nr_rotate_swap = false;
if (swap_flags & ~SWAP_FLAGS_VALID)
return -EINVAL;
@@ -3215,8 +3216,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
- } else
+ } else {
atomic_inc(&nr_rotate_swap);
+ inced_nr_rotate_swap = true;
+ }
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
@@ -3307,6 +3310,8 @@ bad_swap:
vfree(swap_map);
kvfree(cluster_info);
kvfree(frontswap_map);
+ if (inced_nr_rotate_swap)
+ atomic_dec(&nr_rotate_swap);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
inode_unlock(inode);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9b697323a88c..9270a4370d54 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1418,7 +1418,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
return ret;
mapping = page_mapping(page);
- migrate_dirty = mapping && mapping->a_ops->migratepage;
+ migrate_dirty = !mapping || mapping->a_ops->migratepage;
unlock_page(page);
if (!migrate_dirty)
return ret;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 536332e988b8..a2b9518980ce 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1161,7 +1161,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
- "nr_indirectly_reclaimable",
+ "", /* nr_indirectly_reclaimable */
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1740,6 +1740,10 @@ static int vmstat_show(struct seq_file *m, void *arg)
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
+ /* Skip hidden vmstat items. */
+ if (*vmstat_text[off] == '\0')
+ return 0;
+
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
diff --git a/mm/z3fold.c b/mm/z3fold.c
index c0bca6153b95..4b366d181f35 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -144,7 +144,8 @@ enum z3fold_page_flags {
PAGE_HEADLESS = 0,
MIDDLE_CHUNK_MAPPED,
NEEDS_COMPACTING,
- PAGE_STALE
+ PAGE_STALE,
+ UNDER_RECLAIM
};
/*****************
@@ -173,6 +174,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
+ clear_bit(UNDER_RECLAIM, &page->private);
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -756,6 +758,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
+ if (test_bit(UNDER_RECLAIM, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ return;
+ }
if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
return;
@@ -840,6 +846,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
+ set_bit(UNDER_RECLAIM, &page->private);
+ break;
}
list_del_init(&page->lru);
@@ -887,25 +895,35 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
goto next;
}
next:
- spin_lock(&pool->lock);
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
- spin_unlock(&pool->lock);
free_z3fold_page(page);
return 0;
}
- } else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
- atomic64_dec(&pool->pages_nr);
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ } else {
+ z3fold_page_lock(zhdr);
+ clear_bit(UNDER_RECLAIM, &page->private);
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return 0;
+ }
+ /*
+ * if we are here, the page is still not completely
+ * free. Take the global pool lock then to be able
+ * to add it back to the lru list
+ */
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
- return 0;
+ z3fold_page_unlock(zhdr);
}
- /*
- * Add to the beginning of LRU.
- * Pool lock has to be kept here to ensure the page has
- * not already been released
- */
- list_add(&page->lru, &pool->lru);
+ /* We started off locked to we need to lock the pool back */
+ spin_lock(&pool->lock);
}
spin_unlock(&pool->lock);
return -EAGAIN;