Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            21
-rw-r--r--  mm/compaction.c      158
-rw-r--r--  mm/filemap.c           2
-rw-r--r--  mm/highmem.c          12
-rw-r--r--  mm/huge_memory.c      11
-rw-r--r--  mm/hugetlb.c          37
-rw-r--r--  mm/internal.h          9
-rw-r--r--  mm/memcontrol.c       38
-rw-r--r--  mm/memory_hotplug.c   21
-rw-r--r--  mm/mempolicy.c        63
-rw-r--r--  mm/migrate.c           6
-rw-r--r--  mm/mmap.c              5
-rw-r--r--  mm/mmzone.c            2
-rw-r--r--  mm/mremap.c           47
-rw-r--r--  mm/oom_kill.c        112
-rw-r--r--  mm/page-writeback.c    8
-rw-r--r--  mm/page_alloc.c      885
-rw-r--r--  mm/page_isolation.c   10
-rw-r--r--  mm/page_owner.c        5
-rw-r--r--  mm/rmap.c              4
-rw-r--r--  mm/shmem.c           130
-rw-r--r--  mm/slab.c            771
-rw-r--r--  mm/slub.c             16
-rw-r--r--  mm/swap_state.c        3
-rw-r--r--  mm/util.c             23
-rw-r--r--  mm/vmscan.c           27
-rw-r--r--  mm/vmstat.c          112
27 files changed, 1556 insertions, 982 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 989f8f3d77e0..b0432b71137d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE
def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG
+config MEMORY_HOTPLUG_DEFAULT_ONLINE
+ bool "Online the newly added memory blocks by default"
+ default n
+ depends on MEMORY_HOTPLUG
+ help
+ This option sets the default for the memory hotplug onlining policy
+ (/sys/devices/system/memory/auto_online_blocks), which determines
+ what happens to newly added memory regions. The policy can always be
+ changed at runtime.
+ See Documentation/memory-hotplug.txt for more information.
+
+ Say Y here if you want all hot-plugged memory blocks to appear in
+ 'online' state by default.
+ Say N here if you want the default policy to keep all hot-plugged
+ memory blocks in 'offline' state.
+
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
select MEMORY_ISOLATION
@@ -268,11 +284,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
-config ZONE_DMA_FLAG
- int
- default "0" if !ZONE_DMA
- default "1"
-
config BOUNCE
bool "Enable bounce buffers"
default y
diff --git a/mm/compaction.c b/mm/compaction.c
index 8fa254043801..eda3c2244f30 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
+#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
+#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
+#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
+#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
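For reference, here is a minimal stand-alone sketch (not part of the patch) of how the new rounding helpers behave, with round_down()/ALIGN() open-coded and a pageblock_order of 9 assumed:

#include <stdio.h>

/* Open-coded stand-ins for the kernel's round_down()/ALIGN(); both only
 * work for power-of-two alignments, which is all the helpers above need. */
#define round_down(x, y)	((x) & ~((y) - 1))
#define ALIGN(x, y)		(((x) + (y) - 1) & ~((y) - 1))

#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))

int main(void)
{
	unsigned long pfn = 1234;
	int order = 9;	/* assumed pageblock_order: 512-page blocks */

	/* 1234 lies in the block [1024, 1536): start rounds down, end rounds
	 * (pfn + 1) up, so a pfn on a boundary still maps into its own block */
	printf("start=%lu end=%lu\n",
	       block_start_pfn(pfn, order), block_end_pfn(pfn, order));
	return 0;
}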
@@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone)
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
zone->compact_cached_free_pfn =
- round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
+ pageblock_start_pfn(zone_end_pfn(zone) - 1);
}
/*
@@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc,
LIST_HEAD(freelist);
pfn = start_pfn;
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(pfn);
if (block_start_pfn < cc->zone->zone_start_pfn)
block_start_pfn = cc->zone->zone_start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn += isolated,
block_start_pfn = block_end_pfn,
@@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_start_pfn = pageblock_start_pfn(pfn);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
}
@@ -633,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
- struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
+ bool skip_on_failure = false;
+ unsigned long next_skip_pfn = 0;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -659,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (compact_should_abort(cc))
return 0;
+ if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
+ skip_on_failure = true;
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
bool is_lru;
+ if (skip_on_failure && low_pfn >= next_skip_pfn) {
+ /*
+ * We have isolated all migration candidates in the
+ * previous order-aligned block, and did not skip it due
+ * to failure. We should migrate the pages now and
+ * hopefully succeed compaction.
+ */
+ if (nr_isolated)
+ break;
+
+ /*
+ * We failed to isolate in the previous order-aligned
+ * block. Set the new boundary to the end of the
+ * current block. Note we can't simply increase
+ * next_skip_pfn by 1 << order, as low_pfn might have
+ * been incremented by a higher number due to skipping
+ * a compound or a high-order buddy page in the
+ * previous loop iteration.
+ */
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
@@ -674,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
break;
if (!pfn_valid_within(low_pfn))
- continue;
+ goto isolate_fail;
nr_scanned++;
page = pfn_to_page(low_pfn);
@@ -729,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (likely(comp_order < MAX_ORDER))
low_pfn += (1UL << comp_order) - 1;
- continue;
+ goto isolate_fail;
}
if (!is_lru)
- continue;
+ goto isolate_fail;
/*
* Migration will fail if an anonymous page is pinned in memory,
@@ -742,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
- continue;
+ goto isolate_fail;
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
@@ -753,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
- continue;
+ goto isolate_fail;
/*
* Page become compound since the non-locked check,
@@ -762,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (unlikely(PageCompound(page))) {
low_pfn += (1UL << compound_order(page)) - 1;
- continue;
+ goto isolate_fail;
}
}
@@ -770,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
- continue;
+ goto isolate_fail;
VM_BUG_ON_PAGE(PageCompound(page), page);
@@ -778,15 +811,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
del_page_from_lru_list(page, lruvec, page_lru(page));
isolate_success:
- list_add(&page->lru, migratelist);
+ list_add(&page->lru, &cc->migratepages);
cc->nr_migratepages++;
nr_isolated++;
+ /*
+ * Record where we could have freed pages by migration and not
+ * yet flushed them to buddy allocator.
+ * - this is the lowest page that was isolated and likely be
+ * then freed by migration.
+ */
+ if (!cc->last_migrated_pfn)
+ cc->last_migrated_pfn = low_pfn;
+
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
+
+ continue;
+isolate_fail:
+ if (!skip_on_failure)
+ continue;
+
+ /*
+ * We have isolated some pages, but then failed. Release them
+ * instead of migrating, as we cannot form the cc->order buddy
+ * page anyway.
+ */
+ if (nr_isolated) {
+ if (locked) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
+ acct_isolated(zone, cc);
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ cc->last_migrated_pfn = 0;
+ nr_isolated = 0;
+ }
+
+ if (low_pfn < next_skip_pfn) {
+ low_pfn = next_skip_pfn - 1;
+ /*
+ * The check near the loop beginning would have updated
+ * next_skip_pfn too, but this is a bit simpler.
+ */
+ next_skip_pfn += 1UL << cc->order;
+ }
}
/*
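A toy illustration (not the patch itself; the pfn range, order and failure pattern are made up) of the skip-on-failure policy added here: once isolation fails anywhere inside a cc->order aligned block, the scanner gives up on the rest of that block and jumps to its end.

#include <stdbool.h>
#include <stdio.h>

#define ALIGN(x, y)			(((x) + (y) - 1) & ~((y) - 1))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))

/* hypothetical predicate standing in for a successful isolation */
static bool isolatable(unsigned long pfn)
{
	return pfn % 5 != 0;
}

int main(void)
{
	unsigned long pfn, end_pfn = 32;
	int order = 3;	/* scan in order-3 (8-page) aligned blocks */

	for (pfn = 0; pfn < end_pfn; pfn++) {
		if (!isolatable(pfn)) {
			/* cannot form an order-3 buddy here anyway, so skip
			 * to the end of the current aligned block */
			pfn = block_end_pfn(pfn, order) - 1;
			continue;
		}
		printf("isolated %lu\n", pfn);
	}
	return 0;
}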
@@ -834,10 +907,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(pfn);
if (block_start_pfn < cc->zone->zone_start_pfn)
block_start_pfn = cc->zone->zone_start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
@@ -924,10 +997,10 @@ static void isolate_freepages(struct compact_control *cc)
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_start_pfn = pageblock_start_pfn(cc->free_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
- low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+ low_pfn = pageblock_end_pfn(cc->migrate_pfn);
/*
* Isolate free pages until enough are available to migrate the
@@ -1070,7 +1143,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long block_start_pfn;
unsigned long block_end_pfn;
unsigned long low_pfn;
- unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1081,12 +1153,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
- block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(low_pfn);
/*
* Iterate over whole pageblocks until we find the first suitable.
@@ -1125,7 +1197,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
/* Perform the isolation */
- isolate_start_pfn = low_pfn;
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
@@ -1135,15 +1206,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that could have been isolated and
- * then freed by migration.
- */
- if (cc->nr_migratepages && !cc->last_migrated_pfn)
- cc->last_migrated_pfn = isolate_start_pfn;
-
- /*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
@@ -1251,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_CONTINUE - If compaction should run now
*/
static unsigned long __compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags,
+ int classzone_idx)
{
int fragindex;
unsigned long watermark;
@@ -1296,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
}
unsigned long compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags,
+ int classzone_idx)
{
unsigned long ret;
@@ -1343,7 +1407,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
- cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
@@ -1398,6 +1462,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
ret = COMPACT_CONTENDED;
goto out;
}
+ /*
+ * We failed to migrate at least one page in the current
+ * order-aligned block, so skip the rest of it.
+ */
+ if (cc->direct_compaction &&
+ (cc->mode == MIGRATE_ASYNC)) {
+ cc->migrate_pfn = block_end_pfn(
+ cc->migrate_pfn - 1, cc->order);
+ /* Draining pcplists is useless in this case */
+ cc->last_migrated_pfn = 0;
+
+ }
}
check_drain:
@@ -1411,7 +1487,7 @@ check_drain:
if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
- cc->migrate_pfn & ~((1UL << cc->order) - 1);
+ block_start_pfn(cc->migrate_pfn, cc->order);
if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
@@ -1436,7 +1512,7 @@ out:
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
/* The cached pfn is always the first in a pageblock */
- free_pfn &= ~(pageblock_nr_pages-1);
+ free_pfn = pageblock_start_pfn(free_pfn);
/*
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
@@ -1456,7 +1532,7 @@ out:
static unsigned long compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx)
{
unsigned long ret;
struct compact_control cc = {
@@ -1497,8 +1573,8 @@ int sysctl_extfrag_threshold = 500;
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended)
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended)
{
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
@@ -1526,7 +1602,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
status = compact_zone_order(zone, order, gfp_mask, mode,
&zone_contended, alloc_flags,
- ac->classzone_idx);
+ ac_classzone_idx(ac));
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
@@ -1536,7 +1612,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- ac->classzone_idx, alloc_flags)) {
+ ac_classzone_idx(ac), alloc_flags)) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
diff --git a/mm/filemap.c b/mm/filemap.c
index 182b21825255..01690338e3d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -213,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
* some other bad page check should catch it later.
*/
page_mapcount_reset(page);
- atomic_sub(mapcount, &page->_count);
+ page_ref_sub(page, mapcount);
}
}
diff --git a/mm/highmem.c b/mm/highmem.c
index 123bcd3ed4f2..50b4ca6787f0 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
unsigned int nr_free_highpages (void)
{
- pg_data_t *pgdat;
+ struct zone *zone;
unsigned int pages = 0;
- for_each_online_pgdat(pgdat) {
- pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
- NR_FREE_PAGES);
- if (zone_movable_is_highmem())
- pages += zone_page_state(
- &pgdat->node_zones[ZONE_MOVABLE],
- NR_FREE_PAGES);
+ for_each_populated_zone(zone) {
+ if (is_highmem(zone))
+ pages += zone_page_state(zone, NR_FREE_PAGES);
}
return pages;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b49ee126d4d1..66675eed67be 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
- unsigned long old_addr,
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
-
struct mm_struct *mm = vma->vm_mm;
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE ||
- (new_vma->vm_flags & VM_NOHUGEPAGE))
+ old_end - old_addr < HPAGE_PMD_SIZE)
return false;
/*
@@ -3113,7 +3110,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
- * tail_page->_count is zero and not changing from under us. But
+ * tail_page->_refcount is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
* tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
@@ -3340,7 +3337,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mlocked)
lru_add_drain();
- /* Prevent deferred_split_scan() touching ->_count */
+ /* Prevent deferred_split_scan() touching ->_refcount */
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08b396f..949d80609a32 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
}
}
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->rsv_hpages) {
if (delta > spool->rsv_hpages) {
/*
* Asking for more reserves than those already taken on
@@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
if (spool->rsv_hpages + delta <= spool->min_hpages)
ret = 0;
else
@@ -937,9 +940,7 @@ err:
*/
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
+ nid = next_node_in(nid, *nodes_allowed);
VM_BUG_ON(nid >= MAX_NUMNODES);
return nid;
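The conversions to next_node_in() here and in several other files rely on it wrapping back to the first set node. A rough userspace sketch of that semantic over a plain bitmask instead of a nodemask_t (the helper names and the 16-node limit are assumptions, not kernel code):

#include <stdio.h>

#define MAX_NODES 16

static int first_set(unsigned int mask)
{
	for (int i = 0; i < MAX_NODES; i++)
		if (mask & (1u << i))
			return i;
	return MAX_NODES;	/* empty mask */
}

/* next set bit strictly after "node", wrapping to the first set bit:
 * the behaviour the new next_node_in() helper is used for above */
static int next_set_in(int node, unsigned int mask)
{
	for (int i = node + 1; i < MAX_NODES; i++)
		if (mask & (1u << i))
			return i;
	return first_set(mask);
}

int main(void)
{
	unsigned int nodes_allowed = (1u << 0) | (1u << 2) | (1u << 5);

	/* walks 0 -> 2 -> 5 -> 0 */
	printf("%d %d %d\n", next_set_in(0, nodes_allowed),
	       next_set_in(2, nodes_allowed), next_set_in(5, nodes_allowed));
	return 0;
}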
@@ -1030,8 +1031,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
}
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
- unsigned long nr_pages)
+static bool pfn_range_valid_gigantic(struct zone *z,
+ unsigned long start_pfn, unsigned long nr_pages)
{
unsigned long i, end_pfn = start_pfn + nr_pages;
struct page *page;
@@ -1042,6 +1043,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn,
page = pfn_to_page(i);
+ if (page_zone(page) != z)
+ return false;
+
if (PageReserved(page))
return false;
@@ -1074,7 +1078,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
pfn = ALIGN(z->zone_start_pfn, nr_pages);
while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
@@ -2659,6 +2663,11 @@ static int __init hugetlb_init(void)
subsys_initcall(hugetlb_init);
/* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+ parsed_valid_hugepagesz = false;
+}
+
void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
@@ -2678,8 +2687,8 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
- h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -2691,11 +2700,17 @@ static int __init hugetlb_nrpages_setup(char *s)
unsigned long *mhp;
static unsigned long *last_mhp;
+ if (!parsed_valid_hugepagesz) {
+ pr_warn("hugepages = %s preceded by "
+ "an unsupported hugepagesz, ignoring\n", s);
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
/*
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!hugetlb_max_hstate)
+ else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
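A toy model (userspace, with made-up parser names and supported sizes) of the boot-parameter handshake introduced above: an unsupported hugepagesz= flips parsed_valid_hugepagesz, and the hugepages= value that follows is dropped instead of being applied to a bogus hstate.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool parsed_valid_hugepagesz = true;
static unsigned long default_max_huge_pages;

static void parse_hugepagesz(const char *s)
{
	if (strcmp(s, "2M") && strcmp(s, "1G")) {	/* assumed supported sizes */
		printf("unsupported hugepagesz %s\n", s);
		parsed_valid_hugepagesz = false;	/* plays the role of hugetlb_bad_size() */
	}
}

static void parse_hugepages(const char *s)
{
	if (!parsed_valid_hugepagesz) {
		printf("hugepages=%s ignored (bad hugepagesz)\n", s);
		parsed_valid_hugepagesz = true;
		return;
	}
	default_max_huge_pages = strtoul(s, NULL, 10);
}

int main(void)
{
	parse_hugepagesz("4M");		/* unsupported on this fake arch */
	parse_hugepages("64");		/* ignored */
	parse_hugepagesz("2M");
	parse_hugepages("64");		/* applied */
	printf("max huge pages: %lu\n", default_max_huge_pages);
	return 0;
}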
diff --git a/mm/internal.h b/mm/internal.h
index b79abb6721cf..3ac544f1963f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
}
/*
- * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
@@ -102,13 +102,14 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
struct alloc_context {
struct zonelist *zonelist;
nodemask_t *nodemask;
- struct zone *preferred_zone;
- int classzone_idx;
+ struct zoneref *preferred_zoneref;
int migratetype;
enum zone_type high_zoneidx;
bool spread_dirty_pages;
};
+#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
+
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
@@ -175,7 +176,7 @@ struct compact_control {
bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
- const int alloc_flags; /* alloc flags of a direct compactor */
+ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
const int classzone_idx; /* zone index of a direct compactor */
struct zone *zone;
int contended; /* Signal need_sched() or lock
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe787f5c41bd..d71d387868e6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1023,22 +1023,40 @@ out:
* @lru: index of lru list the page is sitting on
* @nr_pages: positive when adding or negative when removing
*
- * This function must be called when a page is added to or removed from an
- * lru list.
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list (that ordering being
+ * so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages)
{
struct mem_cgroup_per_zone *mz;
unsigned long *lru_size;
+ long size;
+ bool empty;
+
+ __update_lru_size(lruvec, lru, nr_pages);
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
lru_size = mz->lru_size + lru;
- *lru_size += nr_pages;
- VM_BUG_ON((long)(*lru_size) < 0);
+ empty = list_empty(lruvec->lists + lru);
+
+ if (nr_pages < 0)
+ *lru_size += nr_pages;
+
+ size = *lru_size;
+ if (WARN_ONCE(size < 0 || empty != !size,
+ "%s(%p, %d, %d): lru_size %ld but %sempty\n",
+ __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+ VM_BUG_ON(1);
+ *lru_size = 0;
+ }
+
+ if (nr_pages > 0)
+ *lru_size += nr_pages;
}
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
@@ -1257,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
mark_oom_victim(current);
+ try_oom_reaper(current);
goto unlock;
}
@@ -1389,14 +1408,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
- node = next_node(node, memcg->scan_nodes);
- if (node == MAX_NUMNODES)
- node = first_node(memcg->scan_nodes);
+ node = next_node_in(node, memcg->scan_nodes);
/*
- * We call this when we hit limit, not when pages are added to LRU.
- * No LRU may hold pages because all pages are UNEVICTABLE or
- * memcg is too small and all pages are not on LRU. In that case,
- * we use curret node.
+ * mem_cgroup_may_update_nodemask might have seen no reclaimable pages

+ * last time it really checked all the LRUs due to rate limiting.
+ * Fallback to the current node in that case for simplicity.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aa34431c3f31..caf2a14c37ad 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -78,9 +78,24 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
+#else
+bool memhp_auto_online = true;
+#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);
+static int __init setup_memhp_default_state(char *str)
+{
+ if (!strcmp(str, "online"))
+ memhp_auto_online = true;
+ else if (!strcmp(str, "offline"))
+ memhp_auto_online = false;
+
+ return 1;
+}
+__setup("memhp_default_state=", setup_memhp_default_state);
+
void get_online_mems(void)
{
might_sleep();
@@ -1410,7 +1425,7 @@ static struct page *next_active_pageblock(struct page *page)
}
/* Checks if this range of memory is likely to be hot-removable. */
-int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
struct page *page = pfn_to_page(start_pfn);
struct page *end_page = page + nr_pages;
@@ -1418,12 +1433,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
if (!is_pageblock_removable_nolock(page))
- return 0;
+ return false;
cond_resched();
}
/* All pageblocks in the memory block are likely to be hot-removable */
- return 1;
+ return true;
}
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36cc01bc950a..297d6854f849 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,7 +97,6 @@
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
-#include <linux/random.h>
#include "internal.h"
@@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
BUG();
if (!node_isset(current->il_next, tmp)) {
- current->il_next = next_node(current->il_next, tmp);
- if (current->il_next >= MAX_NUMNODES)
- current->il_next = first_node(tmp);
+ current->il_next = next_node_in(current->il_next, tmp);
if (current->il_next >= MAX_NUMNODES)
current->il_next = numa_node_id();
}
@@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
struct task_struct *me = current;
nid = me->il_next;
- next = next_node(nid, policy->v.nodes);
- if (next >= MAX_NUMNODES)
- next = first_node(policy->v.nodes);
+ next = next_node_in(nid, policy->v.nodes);
if (next < MAX_NUMNODES)
me->il_next = next;
return nid;
@@ -1744,18 +1739,18 @@ unsigned int mempolicy_slab_node(void)
return interleave_nodes(policy);
case MPOL_BIND: {
+ struct zoneref *z;
+
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
- struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[0];
- (void)first_zones_zonelist(zonelist, highest_zoneidx,
- &policy->v.nodes,
- &zone);
- return zone ? zone->node : node;
+ z = first_zones_zonelist(zonelist, highest_zoneidx,
+ &policy->v.nodes);
+ return z->zone ? z->zone->node : node;
}
default:
@@ -1763,23 +1758,25 @@ unsigned int mempolicy_slab_node(void)
}
}
-/* Do static interleaving for a VMA with known offset. */
+/*
+ * Do static interleaving for a VMA with known offset @n. Returns the n'th
+ * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * number of present nodes.
+ */
static unsigned offset_il_node(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long off)
+ struct vm_area_struct *vma, unsigned long n)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
- int c;
- int nid = NUMA_NO_NODE;
+ int i;
+ int nid;
if (!nnodes)
return numa_node_id();
- target = (unsigned int)off % nnodes;
- c = 0;
- do {
+ target = (unsigned int)n % nnodes;
+ nid = first_node(pol->v.nodes);
+ for (i = 0; i < target; i++)
nid = next_node(nid, pol->v.nodes);
- c++;
- } while (c <= target);
return nid;
}
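The rewritten offset_il_node() walks to the (n mod nnodes)'th set node starting from the first one. Below is a small stand-alone equivalent over a plain bitmask (the helper name and the GCC/Clang __builtin_popcount() are illustration choices, not kernel code):

#include <stdio.h>

/* pick the (n mod weight)'th set bit of mask, starting from the lowest */
static int nth_set_bit(unsigned int mask, unsigned int n)
{
	int weight = __builtin_popcount(mask);
	int nid = -1;

	if (!weight)
		return -1;	/* caller would fall back to numa_node_id() */

	n %= weight;
	for (unsigned int i = 0; i <= n; i++) {
		do {
			nid++;
		} while (!(mask & (1u << nid)));
	}
	return nid;
}

int main(void)
{
	unsigned int pol_nodes = (1u << 1) | (1u << 3) | (1u << 4);

	/* offsets 0..3 interleave over nodes 1, 3, 4, 1 */
	for (unsigned int off = 0; off < 4; off++)
		printf("%u -> %d\n", off, nth_set_bit(pol_nodes, off));
	return 0;
}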
@@ -1805,21 +1802,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
return interleave_nodes(pol);
}
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
- int w, bit = NUMA_NO_NODE;
-
- w = nodes_weight(*maskp);
- if (w)
- bit = bitmap_ord_to_pos(maskp->bits,
- get_random_int() % w, MAX_NUMNODES);
- return bit;
-}
-
#ifdef CONFIG_HUGETLBFS
/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
@@ -2284,7 +2266,7 @@ static void sp_free(struct sp_node *n)
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
- struct zone *zone;
+ struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
@@ -2316,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_BIND:
+
/*
* allows binding to multiple nodes.
* use current page if in policy nodemask,
@@ -2324,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
*/
if (node_isset(curnid, pol->v.nodes))
goto out;
- (void)first_zones_zonelist(
+ z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
- &pol->v.nodes, &zone);
- polnid = zone->node;
+ &pol->v.nodes);
+ polnid = z->zone->node;
break;
default:
diff --git a/mm/migrate.c b/mm/migrate.c
index f9dfb18a4eba..53ab6398e7a2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
+ __SetPageSwapBacked(newpage);
return MIGRATEPAGE_SUCCESS;
}
@@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
+ __SetPageSwapBacked(newpage);
get_page(newpage); /* add cache reference */
if (PageSwapCache(page)) {
@@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
- SetPageSwapBacked(new_page);
+ __SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
diff --git a/mm/mmap.c b/mm/mmap.c
index bd2e1a533bc1..fba246b8f1a5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -55,10 +55,6 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
-#ifndef arch_rebalance_pgtables
-#define arch_rebalance_pgtables(addr, len) (addr)
-#endif
-
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
@@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (offset_in_page(addr))
return -EINVAL;
- addr = arch_rebalance_pgtables(addr, len);
error = security_mmap_addr(addr);
return error ? error : addr;
}
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 52687fb4de6f..5652be858e5e 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
}
/* Returns the next zone at or below highest_zoneidx in a zonelist */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes)
{
diff --git a/mm/mremap.c b/mm/mremap.c
index 3fa0a467df66..9dc499977924 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return pmd;
}
+static void take_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+}
+
+static void drop_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (vma->vm_file)
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+}
+
static pte_t move_soft_dirty_pte(pte_t pte)
{
/*
@@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
- struct address_space *mapping = NULL;
- struct anon_vma *anon_vma = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
@@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* serialize access to individual ptes, but only rmap traversal
* order guarantees that we won't miss both the old and new ptes).
*/
- if (need_rmap_locks) {
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- }
- if (vma->anon_vma) {
- anon_vma = vma->anon_vma;
- anon_vma_lock_write(anon_vma);
- }
- }
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
/*
* We don't have to worry about the ordering of src and dst
@@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
- if (anon_vma)
- anon_vma_unlock_write(anon_vma);
- if (mapping)
- i_mmap_unlock_write(mapping);
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -193,16 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pmd_trans_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
- VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
- vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
- anon_vma_lock_write(vma->anon_vma);
- moved = move_huge_pmd(vma, new_vma, old_addr,
- new_addr, old_end,
- old_pmd, new_pmd);
+ take_rmap_locks(vma);
+ moved = move_huge_pmd(vma, old_addr, new_addr,
+ old_end, old_pmd, new_pmd);
if (need_rmap_locks)
- anon_vma_unlock_write(vma->anon_vma);
+ drop_rmap_locks(vma);
if (moved) {
need_flush = true;
continue;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 86349586eacb..415f7eb913fa 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly;
#define K(x) ((x) << (PAGE_SHIFT-10))
+/*
+ * task->mm can be NULL if the task is the exited group leader. So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm)
+ return t_mm == mm;
+ }
+ return false;
+}
+
+
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -491,14 +510,10 @@ static bool __oom_reap_task(struct task_struct *tsk)
up_read(&mm->mmap_sem);
/*
- * Clear TIF_MEMDIE because the task shouldn't be sitting on a
- * reasonably reclaimable memory anymore. OOM killer can continue
- * by selecting other victim if unmapping hasn't led to any
- * improvements. This also means that selecting this task doesn't
- * make any sense.
+ * This task can be safely ignored because we cannot do much more
+ * to release its memory.
*/
tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
- exit_oom_victim(tsk);
out:
mmput(mm);
return ret;
@@ -519,6 +534,15 @@ static void oom_reap_task(struct task_struct *tsk)
debug_show_all_locks();
}
+ /*
+ * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+ * reasonably reclaimable memory anymore or it is not a good candidate
+ * for the oom victim right now because it cannot release its memory
+ * itself nor by the oom reaper.
+ */
+ tsk->oom_reaper_list = NULL;
+ exit_oom_victim(tsk);
+
/* Drop a reference taken by wake_oom_reaper */
put_task_struct(tsk);
}
@@ -563,6 +587,53 @@ static void wake_oom_reaper(struct task_struct *tsk)
wake_up(&oom_reaper_wait);
}
+/* Check if we can reap the given task. This has to be called with stable
+ * tsk->mm
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+ struct task_struct *p;
+
+ if (!mm)
+ return;
+
+ /*
+ * There might be other threads/processes which are either not
+ * dying or even not killable.
+ */
+ if (atomic_read(&mm->mm_users) > 1) {
+ rcu_read_lock();
+ for_each_process(p) {
+ bool exiting;
+
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(p, tsk))
+ continue;
+ if (fatal_signal_pending(p))
+ continue;
+
+ /*
+ * If the task is exiting, make sure the whole thread group
+ * is exiting and cannot access mm anymore.
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ exiting = signal_group_exit(p->signal);
+ spin_unlock_irq(&p->sighand->siglock);
+ if (exiting)
+ continue;
+
+ /* Give up */
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+ }
+
+ wake_oom_reaper(tsk);
+}
+
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -653,24 +724,6 @@ void oom_killer_enable(void)
}
/*
- * task->mm can be NULL if the task is the exited group leader. So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
- struct task_struct *t;
-
- for_each_thread(p, t) {
- struct mm_struct *t_mm = READ_ONCE(t->mm);
- if (t_mm)
- return t_mm == mm;
- }
- return false;
-}
-
-/*
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
@@ -694,6 +747,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
task_lock(p);
if (p->mm && task_will_free_mem(p)) {
mark_oom_victim(p);
+ try_oom_reaper(p);
task_unlock(p);
put_task_struct(p);
return;
@@ -873,10 +927,20 @@ bool out_of_memory(struct oom_control *oc)
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current);
+ try_oom_reaper(current);
return true;
}
/*
+ * The OOM killer does not compensate for IO-less reclaim.
+ * pagefault_out_of_memory lost its gfp context so we have to
+ * make sure to exclude 0 mask - all other users should have at least
+ * ___GFP_DIRECT_RECLAIM to get here.
+ */
+ if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+ return true;
+
+ /*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bc5149d5ec38..3b88795ab46e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
#ifdef CONFIG_HIGHMEM
int node;
unsigned long x = 0;
+ int i;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *z = &NODE_DATA(node)->node_zones[i];
- x += zone_dirtyable_memory(z);
+ if (is_highmem(z))
+ x += zone_dirtyable_memory(z);
+ }
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c1069efcc4d7..5c469c1dfb8b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat,
}
#endif
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+ unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ return __pfn_to_section(pfn)->pageblock_flags;
+#else
+ return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ pfn &= (PAGES_PER_SECTION-1);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+ pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long word;
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+ return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = READ_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
+}
void set_pageblock_migratetype(struct page *page, int migratetype)
{
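The update loop in set_pfnblock_flags_mask() above is a classic lockless read-modify-write retry. A stand-alone sketch of the same pattern, using the GCC/Clang __sync builtin in place of the kernel's cmpxchg() (the field values below are arbitrary):

#include <stdio.h>

static unsigned long set_field(unsigned long *word_p, unsigned long mask,
			       unsigned long flags)
{
	unsigned long old_word, word = *word_p;

	for (;;) {
		old_word = __sync_val_compare_and_swap(word_p, word,
						       (word & ~mask) | flags);
		if (old_word == word)
			break;		/* nobody raced with us, done */
		word = old_word;	/* someone changed the word, retry */
	}
	return word;
}

int main(void)
{
	unsigned long word = 0xff00;

	set_field(&word, 0x00f0, 0x0030);
	printf("0x%lx\n", word);	/* prints 0xff30 */
	return 0;
}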
@@ -784,17 +884,42 @@ out:
zone->free_area[order].nr_free++;
}
-static inline int free_pages_check(struct page *page)
+/*
+ * A bad page could be due to a number of fields. Instead of multiple branches,
+ * try and check multiple fields with one check. The caller must do a detailed
+ * check if necessary.
+ */
+static inline bool page_expected_state(struct page *page,
+ unsigned long check_flags)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
+ return false;
+
+ if (unlikely((unsigned long)page->mapping |
+ page_ref_count(page) |
+#ifdef CONFIG_MEMCG
+ (unsigned long)page->mem_cgroup |
+#endif
+ (page->flags & check_flags)))
+ return false;
+
+ return true;
+}
+
+static void free_pages_check_bad(struct page *page)
+{
+ const char *bad_reason;
+ unsigned long bad_flags;
+
+ bad_reason = NULL;
+ bad_flags = 0;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _count";
+ bad_reason = "nonzero _refcount";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
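page_expected_state() above folds several "must be zero/NULL" checks into a single branch by OR-ing the fields together. A minimal userspace sketch of that pattern with a stand-in struct (not the real struct page):

#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	void *mapping;
	int refcount;
	unsigned long flags;
};

static bool page_expected_state(const struct fake_page *page,
				unsigned long check_flags)
{
	/* one test replaces three separate branches */
	return !((unsigned long)page->mapping |
		 (unsigned long)page->refcount |
		 (page->flags & check_flags));
}

int main(void)
{
	struct fake_page ok = { 0, 0, 0 };
	struct fake_page bad = { 0, 1, 0 };

	printf("%d %d\n", page_expected_state(&ok, ~0UL),
	       page_expected_state(&bad, ~0UL));	/* prints 1 0 */
	return 0;
}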
@@ -803,16 +928,146 @@ static inline int free_pages_check(struct page *page)
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
- return 1;
+ bad_page(page, bad_reason, bad_flags);
+}
+
+static inline int free_pages_check(struct page *page)
+{
+ if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
+ return 0;
+
+ /* Something has gone sideways, find it */
+ free_pages_check_bad(page);
+ return 1;
+}
+
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+ int ret = 1;
+
+ /*
+ * We rely page->lru.next never has bit 0 set, unless the page
+ * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+ */
+ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+ ret = 0;
+ goto out;
}
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
+ if (unlikely(!PageTail(page))) {
+ bad_page(page, "PageTail not set", 0);
+ goto out;
+ }
+ if (unlikely(compound_head(page) != head_page)) {
+ bad_page(page, "compound_head not consistent", 0);
+ goto out;
+ }
+ ret = 0;
+out:
+ page->mapping = NULL;
+ clear_compound_head(page);
+ return ret;
+}
+
+static __always_inline bool free_pages_prepare(struct page *page,
+ unsigned int order, bool check_free)
+{
+ int bad = 0;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ trace_mm_page_free(page, order);
+ kmemcheck_free_shadow(page, order);
+ kasan_free_pages(page, order);
+
+ /*
+ * Check tail pages before head page information is cleared to
+ * avoid checking PageCompound for order-0 pages.
+ */
+ if (unlikely(order)) {
+ bool compound = PageCompound(page);
+ int i;
+
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
+ for (i = 1; i < (1 << order); i++) {
+ if (compound)
+ bad += free_tail_pages_check(page, page + i);
+ if (unlikely(free_pages_check(page + i))) {
+ bad++;
+ continue;
+ }
+ (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ }
+ }
+ if (PageAnonHead(page))
+ page->mapping = NULL;
+ if (check_free)
+ bad += free_pages_check(page);
+ if (bad)
+ return false;
+
page_cpupid_reset_last(page);
- if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- return 0;
+ page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ reset_page_owner(page, order);
+
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),
+ PAGE_SIZE << order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
+ arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
+ kernel_map_pages(page, 1 << order, 0);
+
+ return true;
}
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+ return free_pages_prepare(page, 0, true);
+}
+
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+ return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+ return free_pages_prepare(page, 0, false);
+}
+
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+ return free_pages_check(page);
+}
+#endif /* CONFIG_DEBUG_VM */
+
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
@@ -829,15 +1084,16 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- int to_free = count;
unsigned long nr_scanned;
+ bool isolated_pageblocks;
spin_lock(&zone->lock);
+ isolated_pageblocks = has_isolate_pageblock(zone);
nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
if (nr_scanned)
__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
- while (to_free) {
+ while (count) {
struct page *page;
struct list_head *list;
@@ -857,7 +1113,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
- batch_free = to_free;
+ batch_free = count;
do {
int mt; /* migratetype of the to-be-freed page */
@@ -870,12 +1126,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
- if (unlikely(has_isolate_pageblock(zone)))
+ if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
+ if (bulkfree_pcp_prepare(page))
+ continue;
+
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
- } while (--to_free && --batch_free && !list_empty(list));
+ } while (--count && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
}
@@ -899,56 +1158,6 @@ static void free_one_page(struct zone *zone,
spin_unlock(&zone->lock);
}
-static int free_tail_pages_check(struct page *head_page, struct page *page)
-{
- int ret = 1;
-
- /*
- * We rely page->lru.next never has bit 0 set, unless the page
- * is PageTail(). Let's make sure that's true even for poisoned ->lru.
- */
- BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
-
- if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
- ret = 0;
- goto out;
- }
- switch (page - head_page) {
- case 1:
- /* the first tail page: ->mapping is compound_mapcount() */
- if (unlikely(compound_mapcount(page))) {
- bad_page(page, "nonzero compound_mapcount", 0);
- goto out;
- }
- break;
- case 2:
- /*
- * the second tail page: ->mapping is
- * page_deferred_list().next -- ignore value.
- */
- break;
- default:
- if (page->mapping != TAIL_MAPPING) {
- bad_page(page, "corrupted mapping in tail page", 0);
- goto out;
- }
- break;
- }
- if (unlikely(!PageTail(page))) {
- bad_page(page, "PageTail not set", 0);
- goto out;
- }
- if (unlikely(compound_head(page) != head_page)) {
- bad_page(page, "compound_head not consistent", 0);
- goto out;
- }
- ret = 0;
-out:
- page->mapping = NULL;
- clear_compound_head(page);
- return ret;
-}
-
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
@@ -1022,51 +1231,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
}
}
-static bool free_pages_prepare(struct page *page, unsigned int order)
-{
- bool compound = PageCompound(page);
- int i, bad = 0;
-
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
- trace_mm_page_free(page, order);
- kmemcheck_free_shadow(page, order);
- kasan_free_pages(page, order);
-
- if (PageAnon(page))
- page->mapping = NULL;
- bad += free_pages_check(page);
- for (i = 1; i < (1 << order); i++) {
- if (compound)
- bad += free_tail_pages_check(page, page + i);
- bad += free_pages_check(page + i);
- }
- if (bad)
- return false;
-
- reset_page_owner(page, order);
-
- if (!PageHighMem(page)) {
- debug_check_no_locks_freed(page_address(page),
- PAGE_SIZE << order);
- debug_check_no_obj_freed(page_address(page),
- PAGE_SIZE << order);
- }
- arch_free_page(page, order);
- kernel_poison_pages(page, 1 << order, 0);
- kernel_map_pages(page, 1 << order, 0);
-
- return true;
-}
-
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
- if (!free_pages_prepare(page, order))
+ if (!free_pages_prepare(page, order, true))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1076,8 +1247,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page,
- unsigned long pfn, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1154,7 +1324,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, pfn, order);
+ return __free_pages_boot_core(page, order);
}
/*
@@ -1239,12 +1409,12 @@ static void __init deferred_free_range(struct page *page,
if (nr_pages == MAX_ORDER_NR_PAGES &&
(pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+ __free_pages_boot_core(page, MAX_ORDER-1);
return;
}
- for (i = 0; i < nr_pages; i++, page++, pfn++)
- __free_pages_boot_core(page, pfn, 0);
+ for (i = 0; i < nr_pages; i++, page++)
+ __free_pages_boot_core(page, 0);
}
/* Completion tracking for deferred_init_memmap() threads */
@@ -1477,10 +1647,7 @@ static inline void expand(struct zone *zone, struct page *page,
}
}
-/*
- * This page is about to be returned from the page allocator
- */
-static inline int check_new_page(struct page *page)
+static void check_new_page_bad(struct page *page)
{
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
@@ -1503,11 +1670,20 @@ static inline int check_new_page(struct page *page)
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
- return 1;
- }
- return 0;
+ bad_page(page, bad_reason, bad_flags);
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+ if (likely(page_expected_state(page,
+ PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+ return 0;
+
+ check_new_page_bad(page);
+ return 1;
}
static inline bool free_pages_prezeroed(bool poisoned)
@@ -1516,16 +1692,48 @@ static inline bool free_pages_prezeroed(bool poisoned)
page_poisoning_enabled() && poisoned;
}
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
- int alloc_flags)
+#ifdef CONFIG_DEBUG_VM
+static bool check_pcp_refill(struct page *page)
+{
+ return false;
+}
+
+static bool check_new_pcp(struct page *page)
+{
+ return check_new_page(page);
+}
+#else
+static bool check_pcp_refill(struct page *page)
+{
+ return check_new_page(page);
+}
+static bool check_new_pcp(struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+ int i;
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+
+ if (unlikely(check_new_page(p)))
+ return true;
+ }
+
+ return false;
+}
+
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags)
{
int i;
bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
- if (unlikely(check_new_page(p)))
- return 1;
if (poisoned)
poisoned &= page_is_poisoned(p);
}
@@ -1557,8 +1765,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
set_page_pfmemalloc(page);
else
clear_page_pfmemalloc(page);
-
- return 0;
}
/*
@@ -1980,6 +2186,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (unlikely(page == NULL))
break;
+ if (unlikely(check_pcp_refill(page)))
+ continue;
+
/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
@@ -2157,6 +2366,10 @@ void mark_free_pages(struct zone *zone)
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
}
@@ -2187,7 +2400,7 @@ void free_hot_cold_page(struct page *page, bool cold)
unsigned long pfn = page_to_pfn(page);
int migratetype;
- if (!free_pages_prepare(page, 0))
+ if (!free_pcp_prepare(page))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -2343,12 +2556,44 @@ int split_free_page(struct page *page)
}
/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+ gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+ int local_nid = numa_node_id();
+ enum zone_stat_item local_stat = NUMA_LOCAL;
+
+ if (unlikely(flags & __GFP_OTHER_NODE)) {
+ local_stat = NUMA_OTHER;
+ local_nid = preferred_zone->node;
+ }
+
+ if (z->node == local_nid) {
+ __inc_zone_state(z, NUMA_HIT);
+ __inc_zone_state(z, local_stat);
+ } else {
+ __inc_zone_state(z, NUMA_MISS);
+ __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+ }
+#endif
+}
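As a rough illustration of the accounting above (called with interrupts disabled, as noted), assuming two nodes and no __GFP_OTHER_NODE: a page served by the preferred, local zone bumps NUMA_HIT and NUMA_LOCAL on that zone, while a page that spills to a remote node bumps NUMA_MISS on the zone that served it and NUMA_FOREIGN on the preferred zone:

	zone_statistics(preferred_zone, preferred_zone, gfp_flags);
		/* NUMA_HIT++ and NUMA_LOCAL++ on preferred_zone */

	zone_statistics(preferred_zone, remote_zone, gfp_flags);
		/* NUMA_MISS++ on remote_zone, NUMA_FOREIGN++ on preferred_zone */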
+
+/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int alloc_flags, int migratetype)
+ gfp_t gfp_flags, unsigned int alloc_flags,
+ int migratetype)
{
unsigned long flags;
struct page *page;
@@ -2359,21 +2604,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
struct list_head *list;
local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- if (unlikely(list_empty(list)))
- goto failed;
- }
+ do {
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
+ migratetype, cold);
+ if (unlikely(list_empty(list)))
+ goto failed;
+ }
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
+ if (cold)
+ page = list_last_entry(list, struct page, lru);
+ else
+ page = list_first_entry(list, struct page, lru);
+ } while (page && check_new_pcp(page));
+ __dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru);
pcp->count--;
} else {
@@ -2384,22 +2632,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
- page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- }
- if (!page)
- page = __rmqueue(zone, order, migratetype);
+ do {
+ page = NULL;
+ if (alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
+ if (!page)
+ page = __rmqueue(zone, order, migratetype);
+ } while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
}
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
!test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
@@ -2501,12 +2751,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* to check in the allocation paths if no pages are free.
*/
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags,
+ unsigned long mark, int classzone_idx,
+ unsigned int alloc_flags,
long free_pages)
{
long min = mark;
int o;
- const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2569,12 +2820,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+ int classzone_idx, unsigned int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+ long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+ /*
+ * Fast check for order-0 only. If this fails then the reserves
+ * need to be calculated. There is a corner case where the check
+ * passes but only the high-order atomic reserves are free. If
+ * the caller is !atomic then it'll uselessly search the free
+ * list. That corner case is then slower but it is harmless.
+ */
+ if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ return true;
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
+}
+
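A worked example of the order-0 fast path above, with made-up numbers: NR_FREE_PAGES is 1000, 100 of those are free CMA pages the request may not use, the watermark is 800 and lowmem_reserve[classzone_idx] is 50:

	/* hypothetical order-0 request that is not allowed to use CMA pages */
	long free_pages = 1000, cma_pages = 100;
	long mark = 800, lowmem_reserve = 50;

	if ((free_pages - cma_pages) > mark + lowmem_reserve)
		return true;	/* 900 > 850: watermark met, skip __zone_watermark_ok() */

With a thinner margin, or for any order > 0, the code falls back to the full __zone_watermark_ok() calculation.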
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx)
{
@@ -2630,27 +2907,24 @@ static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
- struct zonelist *zonelist = ac->zonelist;
- struct zoneref *z;
- struct page *page = NULL;
+ struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
- int nr_fair_skipped = 0;
- bool zonelist_rescan;
+ bool fair_skipped = false;
+ bool apply_fair = (alloc_flags & ALLOC_FAIR);
zonelist_scan:
- zonelist_rescan = false;
-
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
+ struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed(zone, gfp_mask))
+ !__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* Distribute pages in proportion to the individual
@@ -2658,13 +2932,16 @@ zonelist_scan:
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
*/
- if (alloc_flags & ALLOC_FAIR) {
- if (!zone_local(ac->preferred_zone, zone))
- break;
+ if (apply_fair) {
if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- nr_fair_skipped++;
+ fair_skipped = true;
continue;
}
+ if (!zone_local(ac->preferred_zoneref->zone, zone)) {
+ if (fair_skipped)
+ goto reset_fair;
+ apply_fair = false;
+ }
}
/*
* When allocating a page cache page for writing, we
@@ -2696,8 +2973,8 @@ zonelist_scan:
continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- if (!zone_watermark_ok(zone, order, mark,
- ac->classzone_idx, alloc_flags)) {
+ if (!zone_watermark_fast(zone, order, mark,
+ ac_classzone_idx(ac), alloc_flags)) {
int ret;
/* Checked here to keep the fast path fast */
@@ -2706,7 +2983,7 @@ zonelist_scan:
goto try_this_zone;
if (zone_reclaim_mode == 0 ||
- !zone_allows_reclaim(ac->preferred_zone, zone))
+ !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
@@ -2720,7 +2997,7 @@ zonelist_scan:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- ac->classzone_idx, alloc_flags))
+ ac_classzone_idx(ac), alloc_flags))
goto try_this_zone;
continue;
@@ -2728,11 +3005,10 @@ zonelist_scan:
}
try_this_zone:
- page = buffered_rmqueue(ac->preferred_zone, zone, order,
+ page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
- if (prep_new_page(page, order, gfp_mask, alloc_flags))
- goto try_this_zone;
+ prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
@@ -2753,18 +3029,13 @@ try_this_zone:
* include remote zones now, before entering the slowpath and waking
* kswapd: prefer spilling to a remote zone over swapping locally.
*/
- if (alloc_flags & ALLOC_FAIR) {
- alloc_flags &= ~ALLOC_FAIR;
- if (nr_fair_skipped) {
- zonelist_rescan = true;
- reset_alloc_batches(ac->preferred_zone);
- }
- if (nr_online_nodes > 1)
- zonelist_rescan = true;
- }
-
- if (zonelist_rescan)
+ if (fair_skipped) {
+reset_fair:
+ apply_fair = false;
+ fair_skipped = false;
+ reset_alloc_batches(ac->preferred_zoneref->zone);
goto zonelist_scan;
+ }
return NULL;
}
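Condensed, the reworked fair-zone pass above behaves as follows (a sketch of the control flow, details elided):

	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ...) {
		if (apply_fair) {
			if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
				fair_skipped = true;	/* remember for a possible rescan */
				continue;
			}
			if (!zone_local(ac->preferred_zoneref->zone, zone)) {
				if (fair_skipped)
					goto reset_fair;
				apply_fair = false;	/* fairness only covers local zones */
			}
		}
		/* watermark checks and buffered_rmqueue() elided */
	}
	if (fair_skipped) {
reset_fair:
		apply_fair = false;
		fair_skipped = false;
		reset_alloc_batches(ac->preferred_zoneref->zone);
		goto zonelist_scan;	/* second and final pass, fairness off */
	}
	return NULL;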
@@ -2872,22 +3143,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
- /* The OOM killer does not compensate for IO-less reclaim */
- if (!(gfp_mask & __GFP_FS)) {
- /*
- * XXX: Page reclaim didn't yield anything,
- * and the OOM killer can't be invoked, but
- * keep looping as per tradition.
- *
- * But do not keep looping if oom_killer_disable()
- * was already called, for the system is trying to
- * enter a quiescent state during suspend.
- */
- *did_some_progress = !oom_killer_disabled;
- goto out;
- }
if (pm_suspended_storage())
goto out;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+ * other requests to make forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+ * access to memory reserved if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
/* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
@@ -2917,7 +3184,7 @@ out:
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
+ unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended_compaction,
bool *deferred_compaction)
{
@@ -2973,7 +3240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
+ unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended_compaction,
bool *deferred_compaction)
{
@@ -3013,7 +3280,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
+ unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
struct page *page = NULL;
@@ -3049,13 +3316,13 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->high_zoneidx, ac->nodemask)
- wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
+ wakeup_kswapd(zone, order, ac_classzone_idx(ac));
}
-static inline int
+static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+ unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -3116,7 +3383,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
- int alloc_flags;
+ unsigned int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
@@ -3153,17 +3420,6 @@ retry:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
- /*
- * Find the true preferred zone if the allocation is unconstrained by
- * cpusets.
- */
- if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
- struct zoneref *preferred_zoneref;
- preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, NULL, &ac->preferred_zone);
- ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
- }
-
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, order,
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3278,7 +3534,7 @@ retry:
if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
/* Wait for some write requests to complete then retry */
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
goto retry;
}
@@ -3316,17 +3572,24 @@ struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
- struct zoneref *preferred_zoneref;
- struct page *page = NULL;
+ struct page *page;
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+ gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
+ .zonelist = zonelist,
.nodemask = nodemask,
.migratetype = gfpflags_to_migratetype(gfp_mask),
};
+ if (cpusets_enabled()) {
+ alloc_mask |= __GFP_HARDWALL;
+ alloc_flags |= ALLOC_CPUSET;
+ if (!ac.nodemask)
+ ac.nodemask = &cpuset_current_mems_allowed;
+ }
+
gfp_mask &= gfp_allowed_mask;
lockdep_trace_alloc(gfp_mask);
@@ -3350,49 +3613,54 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
- /* We set it here, as __alloc_pages_slowpath might have changed it */
- ac.zonelist = zonelist;
-
/* Dirty zone balancing only done in the fast path */
ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
/* The preferred zone is used for statistics later */
- preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
- ac.nodemask ? : &cpuset_current_mems_allowed,
- &ac.preferred_zone);
- if (!ac.preferred_zone)
- goto out;
- ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+ ac.high_zoneidx, ac.nodemask);
+ if (!ac.preferred_zoneref) {
+ page = NULL;
+ goto no_zone;
+ }
/* First allocation attempt */
- alloc_mask = gfp_mask|__GFP_HARDWALL;
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
- if (unlikely(!page)) {
- /*
- * Runtime PM, block IO and its error handling path
- * can deadlock because I/O on the device might not
- * complete.
- */
- alloc_mask = memalloc_noio_flags(gfp_mask);
- ac.spread_dirty_pages = false;
-
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
- }
+ if (likely(page))
+ goto out;
- if (kmemcheck_enabled && page)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+ /*
+ * Runtime PM, block IO and its error handling path can deadlock
+ * because I/O on the device might not complete.
+ */
+ alloc_mask = memalloc_noio_flags(gfp_mask);
+ ac.spread_dirty_pages = false;
- trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+ /*
+ * Restore the original nodemask if it was potentially replaced with
+ * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+ */
+ if (cpusets_enabled())
+ ac.nodemask = nodemask;
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-out:
+no_zone:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+ alloc_mask = gfp_mask;
goto retry_cpuset;
+ }
+
+out:
+ if (kmemcheck_enabled && page)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+ trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
}
@@ -3790,6 +4058,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
{
int zone_type; /* needs to be signed */
unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
@@ -3798,12 +4068,19 @@ void si_meminfo_node(struct sysinfo *val, int nid)
val->sharedram = node_page_state(nid, NR_SHMEM);
val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
- val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
- val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
- NR_FREE_PAGES);
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone->managed_pages;
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
#else
- val->totalhigh = 0;
- val->freehigh = 0;
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
#endif
val->mem_unit = PAGE_SIZE;
}
@@ -4390,13 +4667,12 @@ static void build_zonelists(pg_data_t *pgdat)
*/
int local_memory_node(int node)
{
- struct zone *zone;
+ struct zoneref *z;
- (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+ z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
- NULL,
- &zone);
- return zone->node;
+ NULL);
+ return z->zone->node;
}
#endif
@@ -6725,98 +7001,6 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
- unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- return __pfn_to_section(pfn)->pageblock_flags;
-#else
- return zone->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- pfn &= (PAGES_PER_SECTION-1);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#else
- pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
-{
- struct zone *zone;
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
-
- zone = page_zone(page);
- bitmap = get_pageblock_bitmap(zone, pfn);
- bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
-
- word = bitmap[word_bitidx];
- bitidx += end_bitidx;
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
-}
-
-/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @flags: The flags to set
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
- * @mask: mask of bits that the caller is interested in
- */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
-{
- struct zone *zone;
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long old_word, word;
-
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-
- zone = page_zone(page);
- bitmap = get_pageblock_bitmap(zone, pfn);
- bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
-
- VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
-
- bitidx += end_bitidx;
- mask <<= (BITS_PER_LONG - bitidx - 1);
- flags <<= (BITS_PER_LONG - bitidx - 1);
-
- word = READ_ONCE(bitmap[word_bitidx]);
- for (;;) {
- old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
- if (word == old_word)
- break;
- word = old_word;
- }
-}
-
/*
* This function checks whether pageblock includes unmovable pages or not.
* If @count is not zero, it is okay to include less @count unmovable pages
@@ -6864,7 +7048,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* We can't use page_count without pin a page
* because another CPU can free compound page.
* This check already skips compound tails of THP
- * because their page->_count is zero at all time.
+ * because their page->_refcount is zero at all times.
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
@@ -7177,7 +7361,8 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be isolated before calling this.
+ * All pages in the range must be in a single zone and isolated
+ * before calling this.
*/
void
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c4f568206544..612122bf6a42 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
return pfn;
}
+/* Caller should ensure that requested range is in a single zone */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
@@ -288,13 +289,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
* accordance with memory policy of the user process if possible. For
* now as a simple work-around, we use the next node for destination.
*/
- if (PageHuge(page)) {
- int node = next_online_node(page_to_nid(page));
- if (node == MAX_NUMNODES)
- node = first_online_node;
+ if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- }
+ next_node_in(page_to_nid(page),
+ node_online_map));
if (PageHighMem(page))
gfp_mask |= __GFP_HIGHMEM;
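next_node_in() used here is the wrapping variant of next_node(): it returns the next node set in the mask and falls back to the first set node when it runs past the end, which is exactly the open-coded pattern removed above (and again in mm/slab.c below). A rough equivalent:

	/* roughly what next_node_in(node, node_online_map) does */
	int next = next_node(node, node_online_map);
	if (next >= MAX_NUMNODES)
		next = first_node(node_online_map);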
diff --git a/mm/page_owner.c b/mm/page_owner.c
index ac3d8d129974..792b56da13d8 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
goto err;
/* Print information relevant to grouping pages by mobility */
- pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
@@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
/*
* We are safe to check buddy flag and order, because
* this is init stage and only single thread runs.
diff --git a/mm/rmap.c b/mm/rmap.c
index 307b555024ef..8a839935b18c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- BUG_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
@@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page,
int nr = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- SetPageSwapBacked(page);
+ __SetPageSwapBacked(page);
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
diff --git a/mm/shmem.c b/mm/shmem.c
index e684a9140228..e418a995427d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -101,7 +101,6 @@ struct shmem_falloc {
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
- SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
@@ -122,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
static inline int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, int *fault_type)
+ struct page **pagep, enum sgp_type sgp)
{
return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), fault_type);
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -169,7 +169,7 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
- * pages are allocated, in order to allow huge sparse files.
+ * pages are allocated, in order to allow large sparse files.
* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
@@ -528,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (partial_start) {
struct page *page = NULL;
- shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+ shmem_getpage(inode, start - 1, &page, SGP_READ);
if (page) {
unsigned int top = PAGE_SIZE;
if (start > end) {
@@ -543,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
if (partial_end) {
struct page *page = NULL;
- shmem_getpage(inode, end, &page, SGP_READ, NULL);
+ shmem_getpage(inode, end, &page, SGP_READ);
if (page) {
zero_user_segment(page, 0, partial_end);
set_page_dirty(page);
@@ -947,8 +947,7 @@ redirty:
return 0;
}
-#ifdef CONFIG_NUMA
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
char buffer[64];
@@ -972,7 +971,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
}
return mpol;
}
-#endif /* CONFIG_TMPFS */
+#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ return NULL;
+}
+#endif /* CONFIG_NUMA && CONFIG_TMPFS */
+#ifndef CONFIG_NUMA
+#define vm_policy vm_private_data
+#endif
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
@@ -1008,39 +1018,17 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
- page = alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+ if (page) {
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ }
/* Drop reference taken by mpol_shared_policy_lookup() */
mpol_cond_put(pvma.vm_policy);
return page;
}
-#else /* !CONFIG_NUMA */
-#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
-{
-}
-#endif /* CONFIG_TMPFS */
-
-static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return swapin_readahead(swap, gfp, NULL, 0);
-}
-
-static inline struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return alloc_page(gfp);
-}
-#endif /* CONFIG_NUMA */
-
-#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
-static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
-{
- return NULL;
-}
-#endif
/*
* When a page is moved from swapcache to shmem filecache (either by the
@@ -1084,9 +1072,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __SetPageLocked(newpage);
SetPageUptodate(newpage);
- SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
SetPageSwapCache(newpage);
@@ -1130,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
- * entry since a page cannot live in both the swap and page cache
+ * entry since a page cannot live in both the swap and page cache.
+ *
+ * fault_mm and fault_type are only supplied by shmem_fault:
+ * otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp,
+ struct mm_struct *fault_mm, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
+ struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
@@ -1155,7 +1146,7 @@ repeat:
page = NULL;
}
- if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
goto unlock;
@@ -1183,14 +1174,19 @@ repeat:
*/
info = SHMEM_I(inode);
sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = fault_mm ? : current->mm;
if (swap.val) {
/* Look it up and read it in.. */
page = lookup_swap_cache(swap);
if (!page) {
- /* here we actually do the io */
- if (fault_type)
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
page = shmem_swapin(swap, gfp, info, index);
if (!page) {
error = -ENOMEM;
@@ -1217,7 +1213,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1275,13 +1271,10 @@ repeat:
error = -ENOMEM;
goto decused;
}
-
- __SetPageSwapBacked(page);
- __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
false);
if (error)
goto decused;
@@ -1321,12 +1314,10 @@ clear:
flush_dcache_page(page);
SetPageUptodate(page);
}
- if (sgp == SGP_DIRTY)
- set_page_dirty(page);
}
/* Perhaps the file has been truncated since we checked */
- if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
ClearPageDirty(page);
@@ -1372,6 +1363,7 @@ unlock:
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
+ gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
int error;
int ret = VM_FAULT_LOCKED;
@@ -1433,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
+ error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+ gfp, vma->vm_mm, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
-
- if (ret & VM_FAULT_MAJOR) {
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- }
return ret;
}
@@ -1587,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+ return shmem_getpage(inode, index, pagep, SGP_WRITE);
}
static int
@@ -1633,7 +1621,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* and even mark them dirty, so it cannot exceed the max_blocks limit.
*/
if (!iter_is_iovec(to))
- sgp = SGP_DIRTY;
+ sgp = SGP_CACHE;
index = *ppos >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
@@ -1653,14 +1641,17 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, sgp, NULL);
+ error = shmem_getpage(inode, index, &page, sgp);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
- if (page)
+ if (page) {
+ if (sgp == SGP_CACHE)
+ set_page_dirty(page);
unlock_page(page);
+ }
/*
* We must evaluate after, since reads (unlike writes)
@@ -1766,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
error = 0;
while (spd.nr_pages < nr_pages) {
- error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ error = shmem_getpage(inode, index, &page, SGP_CACHE);
if (error)
break;
unlock_page(page);
@@ -1788,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
page = spd.pages[page_nr];
if (!PageUptodate(page) || page->mapping != mapping) {
- error = shmem_getpage(inode, index, &page,
- SGP_CACHE, NULL);
+ error = shmem_getpage(inode, index, &page, SGP_CACHE);
if (error)
break;
unlock_page(page);
@@ -2232,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_getpage(inode, index, &page, SGP_FALLOC,
- NULL);
+ error = shmem_getpage(inode, index, &page, SGP_FALLOC);
if (error) {
/* Remove the !PageUptodate pages we added */
shmem_undo_range(inode,
@@ -2551,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
- error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+ error = shmem_getpage(inode, 0, &page, SGP_WRITE);
if (error) {
iput(inode);
return error;
@@ -2592,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry,
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_getpage(inode, 0, &page, SGP_READ, NULL);
+ error = shmem_getpage(inode, 0, &page, SGP_READ);
if (error)
return ERR_PTR(error);
unlock_page(page);
@@ -3496,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
int error;
BUG_ON(mapping->a_ops != &shmem_aops);
- error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ gfp, NULL, NULL);
if (error)
page = ERR_PTR(error);
else
diff --git a/mm/slab.c b/mm/slab.c
index 17e2848979c5..c11bf5007952 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list);
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list);
static int slab_early_init = 1;
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
@@ -421,8 +426,6 @@ static struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
};
-#define BAD_ALIEN_MAGIC 0x01020304ul
-
static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -519,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node);
static void init_reap_node(int cpu)
{
- int node;
-
- node = next_node(cpu_to_mem(cpu), node_online_map);
- if (node == MAX_NUMNODES)
- node = first_node(node_online_map);
-
- per_cpu(slab_reap_node, cpu) = node;
+ per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+ node_online_map);
}
static void next_reap_node(void)
{
int node = __this_cpu_read(slab_reap_node);
- node = next_node(node, node_online_map);
- if (unlikely(node >= MAX_NUMNODES))
- node = first_node(node_online_map);
+ node = next_node_in(node, node_online_map);
__this_cpu_write(slab_reap_node, node);
}
@@ -644,7 +640,7 @@ static int transfer_objects(struct array_cache *to,
static inline struct alien_cache **alloc_alien_cache(int node,
int limit, gfp_t gfp)
{
- return (struct alien_cache **)BAD_ALIEN_MAGIC;
+ return NULL;
}
static inline void free_alien_cache(struct alien_cache **ac_ptr)
@@ -850,6 +846,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
}
#endif
+static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
+{
+ struct kmem_cache_node *n;
+
+ /*
+ * Set up the kmem_cache_node for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ n = get_node(cachep, node);
+ if (n) {
+ spin_lock_irq(&n->list_lock);
+ n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
+ cachep->num;
+ spin_unlock_irq(&n->list_lock);
+
+ return 0;
+ }
+
+ n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
+ if (!n)
+ return -ENOMEM;
+
+ kmem_cache_node_init(n);
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+ n->free_limit =
+ (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
+
+ /*
+ * The kmem_cache_nodes don't come and go as CPUs
+ * come and go. slab_mutex is sufficient
+ * protection here.
+ */
+ cachep->node[node] = n;
+
+ return 0;
+}
+
/*
* Allocates and initializes node for a node on each slab cache, used for
* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
@@ -861,46 +897,82 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
*/
static int init_cache_node_node(int node)
{
+ int ret;
struct kmem_cache *cachep;
- struct kmem_cache_node *n;
- const size_t memsize = sizeof(struct kmem_cache_node);
list_for_each_entry(cachep, &slab_caches, list) {
- /*
- * Set up the kmem_cache_node for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
- */
- n = get_node(cachep, node);
- if (!n) {
- n = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!n)
- return -ENOMEM;
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-
- /*
- * The kmem_cache_nodes don't come and go as CPUs
- * come and go. slab_mutex is sufficient
- * protection here.
- */
- cachep->node[node] = n;
- }
-
- spin_lock_irq(&n->list_lock);
- n->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&n->list_lock);
+ ret = init_cache_node(cachep, node, GFP_KERNEL);
+ if (ret)
+ return ret;
}
+
return 0;
}
-static inline int slabs_tofree(struct kmem_cache *cachep,
- struct kmem_cache_node *n)
+static int setup_kmem_cache_node(struct kmem_cache *cachep,
+ int node, gfp_t gfp, bool force_change)
{
- return (n->free_objects + cachep->num - 1) / cachep->num;
+ int ret = -ENOMEM;
+ struct kmem_cache_node *n;
+ struct array_cache *old_shared = NULL;
+ struct array_cache *new_shared = NULL;
+ struct alien_cache **new_alien = NULL;
+ LIST_HEAD(list);
+
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+ if (!new_alien)
+ goto fail;
+ }
+
+ if (cachep->shared) {
+ new_shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
+ if (!new_shared)
+ goto fail;
+ }
+
+ ret = init_cache_node(cachep, node, gfp);
+ if (ret)
+ goto fail;
+
+ n = get_node(cachep, node);
+ spin_lock_irq(&n->list_lock);
+ if (n->shared && force_change) {
+ free_block(cachep, n->shared->entry,
+ n->shared->avail, node, &list);
+ n->shared->avail = 0;
+ }
+
+ if (!n->shared || force_change) {
+ old_shared = n->shared;
+ n->shared = new_shared;
+ new_shared = NULL;
+ }
+
+ if (!n->alien) {
+ n->alien = new_alien;
+ new_alien = NULL;
+ }
+
+ spin_unlock_irq(&n->list_lock);
+ slabs_destroy(cachep, &list);
+
+ /*
+ * To protect lockless access to n->shared during irq disabled context.
+ * If n->shared isn't NULL in irq disabled context, accessing it is
+ * guaranteed to be valid until irq is re-enabled, because it will be
+ * freed after synchronize_sched().
+ */
+ if (force_change)
+ synchronize_sched();
+
+fail:
+ kfree(old_shared);
+ kfree(new_shared);
+ free_alien_cache(new_alien);
+
+ return ret;
}
static void cpuup_canceled(long cpu)
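The synchronize_sched() above pairs with readers that peek at n->shared with interrupts disabled but without taking n->list_lock; the refill path later in this patch is such a reader. Condensed:

	/* cache_alloc_refill(), interrupts already disabled */
	shared = READ_ONCE(n->shared);
	if (!n->free_objects && (!shared || !shared->avail))
		goto direct_grow;	/* skip n->list_lock entirely */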
@@ -967,14 +1039,13 @@ free_slab:
n = get_node(cachep, node);
if (!n)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
}
}
static int cpuup_prepare(long cpu)
{
struct kmem_cache *cachep;
- struct kmem_cache_node *n = NULL;
int node = cpu_to_mem(cpu);
int err;
@@ -993,44 +1064,9 @@ static int cpuup_prepare(long cpu)
* array caches
*/
list_for_each_entry(cachep, &slab_caches, list) {
- struct array_cache *shared = NULL;
- struct alien_cache **alien = NULL;
-
- if (cachep->shared) {
- shared = alloc_arraycache(node,
- cachep->shared * cachep->batchcount,
- 0xbaadf00d, GFP_KERNEL);
- if (!shared)
- goto bad;
- }
- if (use_alien_caches) {
- alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
- if (!alien) {
- kfree(shared);
- goto bad;
- }
- }
- n = get_node(cachep, node);
- BUG_ON(!n);
-
- spin_lock_irq(&n->list_lock);
- if (!n->shared) {
- /*
- * We are serialised from CPU_DEAD or
- * CPU_UP_CANCELLED by the cpucontrol lock
- */
- n->shared = shared;
- shared = NULL;
- }
-#ifdef CONFIG_NUMA
- if (!n->alien) {
- n->alien = alien;
- alien = NULL;
- }
-#endif
- spin_unlock_irq(&n->list_lock);
- kfree(shared);
- free_alien_cache(alien);
+ err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
+ if (err)
+ goto bad;
}
return 0;
@@ -1119,7 +1155,7 @@ static int __meminit drain_cache_node_node(int node)
if (!n)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
if (!list_empty(&n->slabs_full) ||
!list_empty(&n->slabs_partial)) {
@@ -1200,6 +1236,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
}
}
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
+ size_t count)
+{
+ size_t i;
+ unsigned int rand;
+
+ for (i = 0; i < count; i++)
+ list[i] = i;
+
+ /* Fisher-Yates shuffle */
+ for (i = count - 1; i > 0; i--) {
+ rand = prandom_u32_state(state);
+ rand %= (i + 1);
+ swap(list[i], list[rand]);
+ }
+}
+
+/* Create a random sequence per cache */
+static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+ unsigned int seed, count = cachep->num;
+ struct rnd_state state;
+
+ if (count < 2)
+ return 0;
+
+ /* If it fails, we will just use the global lists */
+ cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
+ if (!cachep->random_seq)
+ return -ENOMEM;
+
+ /* Get best entropy at this stage */
+ get_random_bytes_arch(&seed, sizeof(seed));
+ prandom_seed_state(&state, seed);
+
+ freelist_randomize(&state, cachep->random_seq, count);
+ return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+static void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+ kfree(cachep->random_seq);
+ cachep->random_seq = NULL;
+}
+#else
+static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+ return 0;
+}
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
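freelist_randomize() above is a textbook Fisher-Yates shuffle: walk the index array backwards and swap each slot with a uniformly chosen slot at or before it. A standalone user-space sketch of the same shuffle, using rand() purely for illustration:

#include <stdlib.h>

static void shuffle(unsigned int *list, size_t count)
{
	size_t i, j;
	unsigned int tmp;

	for (i = 0; i < count; i++)
		list[i] = i;

	/* Fisher-Yates, same structure as freelist_randomize() */
	for (i = count - 1; i > 0; i--) {
		j = (size_t)rand() % (i + 1);
		tmp = list[i];
		list[i] = list[j];
		list[j] = tmp;
	}
}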
+
+
/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
@@ -1212,7 +1303,7 @@ void __init kmem_cache_init(void)
sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
- if (num_possible_nodes() == 1)
+ if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
use_alien_caches = 0;
for (i = 0; i < NUM_INIT_LISTS; i++)
@@ -1781,7 +1872,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
/*
* Needed to avoid possible looping condition
- * in cache_grow()
+ * in cache_grow_begin()
*/
if (OFF_SLAB(freelist_cache))
continue;
@@ -2138,7 +2229,7 @@ done:
cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
- if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+ if (flags & SLAB_CACHE_DMA)
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
@@ -2180,6 +2271,11 @@ static void check_irq_on(void)
BUG_ON(irqs_disabled());
}
+static void check_mutex_acquired(void)
+{
+ BUG_ON(!mutex_is_locked(&slab_mutex));
+}
+
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
@@ -2199,13 +2295,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
#else
#define check_irq_off() do { } while(0)
#define check_irq_on() do { } while(0)
+#define check_mutex_acquired() do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
- struct array_cache *ac,
- int force, int node);
+static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
+ int node, bool free_all, struct list_head *list)
+{
+ int tofree;
+
+ if (!ac || !ac->avail)
+ return;
+
+ tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
+ if (tofree > ac->avail)
+ tofree = (ac->avail + 1) / 2;
+
+ free_block(cachep, ac->entry, tofree, node, list);
+ ac->avail -= tofree;
+ memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
+}
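The amount drained above is roughly a fifth of the array limit, clamped to about half of what is actually cached unless free_all is set. With hypothetical numbers, ac->limit = 120 and ac->avail = 10:

	tofree = (120 + 4) / 5;		/* 24 */
	if (tofree > 10)		/* more than is cached */
		tofree = (10 + 1) / 2;	/* free 5 objects this pass */

With free_all (the full drain in drain_cpu_caches()), all 10 cached objects would be freed at once.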
static void do_drain(void *arg)
{
@@ -2229,6 +2339,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
{
struct kmem_cache_node *n;
int node;
+ LIST_HEAD(list);
on_each_cpu(do_drain, cachep, 1);
check_irq_on();
@@ -2236,8 +2347,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
if (n->alien)
drain_alien_cache(cachep, n->alien);
- for_each_kmem_cache_node(cachep, node, n)
- drain_array(cachep, n, n->shared, 1, node);
+ for_each_kmem_cache_node(cachep, node, n) {
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, n->shared, node, true, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
+ }
}
/*
@@ -2288,7 +2404,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
check_irq_on();
for_each_kmem_cache_node(cachep, node, n) {
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
ret += !list_empty(&n->slabs_full) ||
!list_empty(&n->slabs_partial);
@@ -2306,6 +2422,8 @@ void __kmem_cache_release(struct kmem_cache *cachep)
int i;
struct kmem_cache_node *n;
+ cache_random_seq_destroy(cachep);
+
free_percpu(cachep->cpu_cache);
/* NUMA: free the node structures */
@@ -2412,15 +2530,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
#endif
}
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Hold information during a freelist initialization */
+union freelist_init_state {
+ struct {
+ unsigned int pos;
+ freelist_idx_t *list;
+ unsigned int count;
+ unsigned int rand;
+ };
+ struct rnd_state rnd_state;
+};
+
+/*
+ * Initialize the state based on the randomization method available.
+ * Return true if the pre-computed list is available, false otherwise.
+ */
+static bool freelist_state_initialize(union freelist_init_state *state,
+ struct kmem_cache *cachep,
+ unsigned int count)
+{
+ bool ret;
+ unsigned int rand;
+
+ /* Use best entropy available to define a random shift */
+ get_random_bytes_arch(&rand, sizeof(rand));
+
+ /* Use a random state if the pre-computed list is not available */
+ if (!cachep->random_seq) {
+ prandom_seed_state(&state->rnd_state, rand);
+ ret = false;
+ } else {
+ state->list = cachep->random_seq;
+ state->count = count;
+ state->pos = 0;
+ state->rand = rand;
+ ret = true;
+ }
+ return ret;
+}
+
+/* Get the next entry on the list and randomize it using a random shift */
+static freelist_idx_t next_random_slot(union freelist_init_state *state)
+{
+ return (state->list[state->pos++] + state->rand) % state->count;
+}
+
+/*
+ * Shuffle the freelist initialization state based on pre-computed lists.
+ * Return true if the list was successfully shuffled, false otherwise.
+ */
+static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+{
+ unsigned int objfreelist = 0, i, count = cachep->num;
+ union freelist_init_state state;
+ bool precomputed;
+
+ if (count < 2)
+ return false;
+
+ precomputed = freelist_state_initialize(&state, cachep, count);
+
+ /* Take a random entry as the objfreelist */
+ if (OBJFREELIST_SLAB(cachep)) {
+ if (!precomputed)
+ objfreelist = count - 1;
+ else
+ objfreelist = next_random_slot(&state);
+ page->freelist = index_to_obj(cachep, page, objfreelist) +
+ obj_offset(cachep);
+ count--;
+ }
+
+ /*
+ * On early boot, generate the list dynamically.
+ * Later use a pre-computed list for speed.
+ */
+ if (!precomputed) {
+ freelist_randomize(&state.rnd_state, page->freelist, count);
+ } else {
+ for (i = 0; i < count; i++)
+ set_free_obj(page, i, next_random_slot(&state));
+ }
+
+ if (OBJFREELIST_SLAB(cachep))
+ set_free_obj(page, cachep->num - 1, objfreelist);
+
+ return true;
+}
+#else
+static inline bool shuffle_freelist(struct kmem_cache *cachep,
+ struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
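With a precomputed sequence, the per-slab randomization above is just a rotation: every entry is offset by one fresh random value, modulo the object count. For example, with a state whose list is {2, 0, 3, 1}, count = 4 and rand = 3:

	next_random_slot(&state);	/* (2 + 3) % 4 = 1 */
	next_random_slot(&state);	/* (0 + 3) % 4 = 3 */
	next_random_slot(&state);	/* (3 + 3) % 4 = 2 */
	next_random_slot(&state);	/* (1 + 3) % 4 = 0 */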
+
static void cache_init_objs(struct kmem_cache *cachep,
struct page *page)
{
int i;
void *objp;
+ bool shuffled;
cache_init_objs_debug(cachep, page);
- if (OBJFREELIST_SLAB(cachep)) {
+ /* Try to randomize the freelist if enabled */
+ shuffled = shuffle_freelist(cachep, page);
+
+ if (!shuffled && OBJFREELIST_SLAB(cachep)) {
page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
obj_offset(cachep);
}
@@ -2434,17 +2652,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
kasan_poison_object_data(cachep, objp);
}
- set_free_obj(page, i, i);
- }
-}
-
-static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
-{
- if (CONFIG_ZONE_DMA_FLAG) {
- if (flags & GFP_DMA)
- BUG_ON(!(cachep->allocflags & GFP_DMA));
- else
- BUG_ON(cachep->allocflags & GFP_DMA);
+ if (!shuffled)
+ set_free_obj(page, i, i);
}
}
@@ -2502,13 +2711,15 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow(struct kmem_cache *cachep,
- gfp_t flags, int nodeid, struct page *page)
+static struct page *cache_grow_begin(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
{
void *freelist;
size_t offset;
gfp_t local_flags;
+ int page_node;
struct kmem_cache_node *n;
+ struct page *page;
/*
* Be lazy and only check for valid flags here, keeping it out of the
@@ -2520,43 +2731,35 @@ static int cache_grow(struct kmem_cache *cachep,
}
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
- /* Take the node list lock to change the colour_next on this node */
check_irq_off();
- n = get_node(cachep, nodeid);
- spin_lock(&n->list_lock);
-
- /* Get colour for the slab, and cal the next value. */
- offset = n->colour_next;
- n->colour_next++;
- if (n->colour_next >= cachep->colour)
- n->colour_next = 0;
- spin_unlock(&n->list_lock);
-
- offset *= cachep->colour_off;
-
if (gfpflags_allow_blocking(local_flags))
local_irq_enable();
/*
- * The test for missing atomic flag is performed here, rather than
- * the more obvious place, simply to reduce the critical path length
- * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
- * will eventually be caught here (where it matters).
- */
- kmem_flagcheck(cachep, flags);
-
- /*
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- if (!page)
- page = kmem_getpages(cachep, local_flags, nodeid);
+ page = kmem_getpages(cachep, local_flags, nodeid);
if (!page)
goto failed;
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
+
+ /* Get colour for the slab, and calculate the next value. */
+ n->colour_next++;
+ if (n->colour_next >= cachep->colour)
+ n->colour_next = 0;
+
+ offset = n->colour_next;
+ if (offset >= cachep->colour)
+ offset = 0;
+
+ offset *= cachep->colour_off;
+
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
- local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
+ local_flags & ~GFP_CONSTRAINT_MASK, page_node);
if (OFF_SLAB(cachep) && !freelist)
goto opps1;
@@ -2567,21 +2770,40 @@ static int cache_grow(struct kmem_cache *cachep,
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- check_irq_off();
- spin_lock(&n->list_lock);
- /* Make slab active. */
- list_add_tail(&page->lru, &(n->slabs_free));
- STATS_INC_GROWN(cachep);
- n->free_objects += cachep->num;
- spin_unlock(&n->list_lock);
- return 1;
+ return page;
+
opps1:
kmem_freepages(cachep, page);
failed:
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- return 0;
+ return NULL;
+}
+
+static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+{
+ struct kmem_cache_node *n;
+ void *list = NULL;
+
+ check_irq_off();
+
+ if (!page)
+ return;
+
+ INIT_LIST_HEAD(&page->lru);
+ n = get_node(cachep, page_to_nid(page));
+
+ spin_lock(&n->list_lock);
+ if (!page->active)
+ list_add_tail(&page->lru, &(n->slabs_free));
+ else
+ fixup_slab_list(cachep, n, page, &list);
+ STATS_INC_GROWN(cachep);
+ n->free_objects += cachep->num - page->active;
+ spin_unlock(&n->list_lock);
+
+ fixup_objfreelist_debug(cachep, &list);
}
#if DEBUG
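Splitting the old cache_grow() into a begin/end pair lets the caller take objects from the fresh slab before it is published on the node lists; the refill path later in this patch uses it roughly like this (condensed):

	page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
	/* cache_grow_begin() may re-enable interrupts, so re-read the cpu cache */
	ac = cpu_cache_get(cachep);
	if (!ac->avail && page)
		alloc_block(cachep, ac, page, batchcount);
	cache_grow_end(cachep, page);	/* publish the (possibly partial) slab */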
@@ -2785,18 +3007,42 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
return obj;
}
+/*
+ * Slab list should be fixed up by fixup_slab_list() for existing slab
+ * or cache_grow_end() for new slab
+ */
+static __always_inline int alloc_block(struct kmem_cache *cachep,
+ struct array_cache *ac, struct page *page, int batchcount)
+{
+ /*
+ * There must be at least one object available for
+ * allocation.
+ */
+ BUG_ON(page->active >= cachep->num);
+
+ while (page->active < cachep->num && batchcount--) {
+ STATS_INC_ALLOCED(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+ }
+
+ return batchcount;
+}
+
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
- struct array_cache *ac;
+ struct array_cache *ac, *shared;
int node;
void *list = NULL;
+ struct page *page;
check_irq_off();
node = numa_mem_id();
-retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2810,16 +3056,20 @@ retry:
n = get_node(cachep, node);
BUG_ON(ac->avail > 0 || !n);
+ shared = READ_ONCE(n->shared);
+ if (!n->free_objects && (!shared || !shared->avail))
+ goto direct_grow;
+
spin_lock(&n->list_lock);
+ shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
- if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
- n->shared->touched = 1;
+ if (shared && transfer_objects(ac, shared, batchcount)) {
+ shared->touched = 1;
goto alloc_done;
}
while (batchcount > 0) {
- struct page *page;
/* Get slab alloc is to come from. */
page = get_first_slab(n, false);
if (!page)
@@ -2827,21 +3077,7 @@ retry:
check_spinlock_acquired(cachep);
- /*
- * The slab was either on partial or free list so
- * there must be at least one object available for
- * allocation.
- */
- BUG_ON(page->active >= cachep->num);
-
- while (page->active < cachep->num && batchcount--) {
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
-
- ac->entry[ac->avail++] = slab_get_obj(cachep, page);
- }
-
+ batchcount = alloc_block(cachep, ac, page, batchcount);
fixup_slab_list(cachep, n, page, &list);
}
@@ -2851,9 +3087,8 @@ alloc_done:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
+direct_grow:
if (unlikely(!ac->avail)) {
- int x;
-
/* Check if we can use obj in pfmemalloc slab */
if (sk_memalloc_socks()) {
void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
@@ -2862,18 +3097,19 @@ alloc_done:
return obj;
}
- x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
- /* cache_grow can reenable interrupts, then ac could change. */
+ /*
+ * cache_grow_begin() can reenable interrupts,
+ * then ac could change.
+ */
ac = cpu_cache_get(cachep);
- node = numa_mem_id();
+ if (!ac->avail && page)
+ alloc_block(cachep, ac, page, batchcount);
+ cache_grow_end(cachep, page);
- /* no objects in sight? abort */
- if (!x && ac->avail == 0)
+ if (!ac->avail)
return NULL;
-
- if (!ac->avail) /* objects refilled by interrupt? */
- goto retry;
}
ac->touched = 1;
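
A compressed sketch of the direct_grow tail above, to make the interrupt hazard explicit: cache_grow_begin() may re-enable IRQs while allocating pages, so the per-cpu array_cache pointer has to be re-read before objects from the fresh slab are pulled into it. refill_by_growing() is a hypothetical wrapper used only for illustration:

static void *refill_by_growing(struct kmem_cache *cachep, gfp_t flags,
                               int node, int batchcount)
{
        struct array_cache *ac;
        struct page *page;

        page = cache_grow_begin(cachep, gfp_exact_node(flags), node);

        /* re-read: an interrupt or cpu-cache resize may have changed it */
        ac = cpu_cache_get(cachep);
        if (!ac->avail && page)
                alloc_block(cachep, ac, page, batchcount);
        cache_grow_end(cachep, page);           /* publish slab + accounting */

        return ac->avail ? ac->entry[--ac->avail] : NULL;
}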
@@ -2884,9 +3120,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
gfp_t flags)
{
might_sleep_if(gfpflags_allow_blocking(flags));
-#if DEBUG
- kmem_flagcheck(cachep, flags);
-#endif
}
#if DEBUG
@@ -2998,19 +3231,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
struct zonelist *zonelist;
- gfp_t local_flags;
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
+ struct page *page;
int nid;
unsigned int cpuset_mems_cookie;
if (flags & __GFP_THISNODE)
return NULL;
- local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
-
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(mempolicy_slab_node(), flags);
@@ -3040,33 +3271,19 @@ retry:
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
- struct page *page;
-
- if (gfpflags_allow_blocking(local_flags))
- local_irq_enable();
- kmem_flagcheck(cache, flags);
- page = kmem_getpages(cache, local_flags, numa_mem_id());
- if (gfpflags_allow_blocking(local_flags))
- local_irq_disable();
+ page = cache_grow_begin(cache, flags, numa_mem_id());
+ cache_grow_end(cache, page);
if (page) {
+ nid = page_to_nid(page);
+ obj = ____cache_alloc_node(cache,
+ gfp_exact_node(flags), nid);
+
/*
- * Insert into the appropriate per node queues
+ * Another processor may allocate the objects in
+ * the slab since we are not holding any locks.
*/
- nid = page_to_nid(page);
- if (cache_grow(cache, flags, nid, page)) {
- obj = ____cache_alloc_node(cache,
- gfp_exact_node(flags), nid);
- if (!obj)
- /*
- * Another processor may allocate the
- * objects in the slab since we are
- * not holding any locks.
- */
- goto retry;
- } else {
- /* cache_grow already freed obj */
- obj = NULL;
- }
+ if (!obj)
+ goto retry;
}
}
@@ -3083,15 +3300,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
{
struct page *page;
struct kmem_cache_node *n;
- void *obj;
+ void *obj = NULL;
void *list = NULL;
- int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
n = get_node(cachep, nodeid);
BUG_ON(!n);
-retry:
check_irq_off();
spin_lock(&n->list_lock);
page = get_first_slab(n, false);
@@ -3113,18 +3328,18 @@ retry:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
- goto done;
+ return obj;
must_grow:
spin_unlock(&n->list_lock);
- x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
- if (x)
- goto retry;
-
- return fallback_alloc(cachep, flags);
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+ if (page) {
+ /* This slab isn't counted yet so don't update free_objects */
+ obj = slab_get_obj(cachep, page);
+ }
+ cache_grow_end(cachep, page);
-done:
- return obj;
+ return obj ? obj : fallback_alloc(cachep, flags);
}
static __always_inline void *
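
To see why the "isn't counted yet" comment above holds, take cachep->num == 8 as an arbitrary example: slab_get_obj() on the fresh page leaves page->active == 1, and cache_grow_end() then adds cachep->num - page->active == 7 to n->free_objects, so the object handed straight back to the caller is never also counted as free.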
@@ -3242,6 +3457,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
{
int i;
struct kmem_cache_node *n = get_node(cachep, node);
+ struct page *page;
+
+ n->free_objects += nr_objects;
for (i = 0; i < nr_objects; i++) {
void *objp;
@@ -3254,17 +3472,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
- n->free_objects++;
/* fixup slab chains */
- if (page->active == 0) {
- if (n->free_objects > n->free_limit) {
- n->free_objects -= cachep->num;
- list_add_tail(&page->lru, list);
- } else {
- list_add(&page->lru, &n->slabs_free);
- }
- } else {
+ if (page->active == 0)
+ list_add(&page->lru, &n->slabs_free);
+ else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
@@ -3272,6 +3484,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
list_add_tail(&page->lru, &n->slabs_partial);
}
}
+
+ while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
+ n->free_objects -= cachep->num;
+
+ page = list_last_entry(&n->slabs_free, struct page, lru);
+ list_del(&page->lru);
+ list_add(&page->lru, list);
+ }
}
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
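
For illustration of the trimming loop added to free_block() above: free_objects is now raised by nr_objects up front, and only afterwards are surplus empty slabs unhooked. With, say, n->free_limit == 120, cachep->num == 8 and n->free_objects reaching 130, the loop detaches two slabs from the tail of n->slabs_free (free_objects drops to 114) and moves them onto the caller's list so they can be destroyed outside n->list_lock. The numbers are arbitrary and only illustrate the accounting.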
@@ -3645,72 +3865,19 @@ EXPORT_SYMBOL(kfree);
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
-static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
+static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
{
+ int ret;
int node;
struct kmem_cache_node *n;
- struct array_cache *new_shared;
- struct alien_cache **new_alien = NULL;
for_each_online_node(node) {
-
- if (use_alien_caches) {
- new_alien = alloc_alien_cache(node, cachep->limit, gfp);
- if (!new_alien)
- goto fail;
- }
-
- new_shared = NULL;
- if (cachep->shared) {
- new_shared = alloc_arraycache(node,
- cachep->shared*cachep->batchcount,
- 0xbaadf00d, gfp);
- if (!new_shared) {
- free_alien_cache(new_alien);
- goto fail;
- }
- }
-
- n = get_node(cachep, node);
- if (n) {
- struct array_cache *shared = n->shared;
- LIST_HEAD(list);
-
- spin_lock_irq(&n->list_lock);
-
- if (shared)
- free_block(cachep, shared->entry,
- shared->avail, node, &list);
-
- n->shared = new_shared;
- if (!n->alien) {
- n->alien = new_alien;
- new_alien = NULL;
- }
- n->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&n->list_lock);
- slabs_destroy(cachep, &list);
- kfree(shared);
- free_alien_cache(new_alien);
- continue;
- }
- n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
- if (!n) {
- free_alien_cache(new_alien);
- kfree(new_shared);
+ ret = setup_kmem_cache_node(cachep, node, gfp, true);
+ if (ret)
goto fail;
- }
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
- n->shared = new_shared;
- n->alien = new_alien;
- n->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- cachep->node[node] = n;
}
+
return 0;
fail:
@@ -3752,7 +3919,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
cachep->shared = shared;
if (!prev)
- goto alloc_node;
+ goto setup_node;
for_each_online_cpu(cpu) {
LIST_HEAD(list);
@@ -3769,8 +3936,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
}
free_percpu(prev);
-alloc_node:
- return alloc_kmem_cache_node(cachep, gfp);
+setup_node:
+ return setup_kmem_cache_nodes(cachep, gfp);
}
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@ -3804,6 +3971,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
int shared = 0;
int batchcount = 0;
+ err = cache_random_seq_create(cachep, gfp);
+ if (err)
+ goto end;
+
if (!is_root_cache(cachep)) {
struct kmem_cache *root = memcg_root_cache(cachep);
limit = root->limit;
@@ -3857,6 +4028,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
batchcount = (limit + 1) / 2;
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+end:
if (err)
pr_err("enable_cpucache failed for %s, error %d\n",
cachep->name, -err);
@@ -3869,29 +4041,26 @@ skip_setup:
* if drain_array() is used on the shared array.
*/
static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
- struct array_cache *ac, int force, int node)
+ struct array_cache *ac, int node)
{
LIST_HEAD(list);
- int tofree;
+
+ /* ac from n->shared can be freed if we don't hold the slab_mutex. */
+ check_mutex_acquired();
if (!ac || !ac->avail)
return;
- if (ac->touched && !force) {
+
+ if (ac->touched) {
ac->touched = 0;
- } else {
- spin_lock_irq(&n->list_lock);
- if (ac->avail) {
- tofree = force ? ac->avail : (ac->limit + 4) / 5;
- if (tofree > ac->avail)
- tofree = (ac->avail + 1) / 2;
- free_block(cachep, ac->entry, tofree, node, &list);
- ac->avail -= tofree;
- memmove(ac->entry, &(ac->entry[tofree]),
- sizeof(void *) * ac->avail);
- }
- spin_unlock_irq(&n->list_lock);
- slabs_destroy(cachep, &list);
+ return;
}
+
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, ac, node, false, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
}
/**
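
drain_array() above now only clears the touched flag on a recently used array and defers the real work to drain_array_locked(), which is added elsewhere in this patch and is not visible in this hunk. A sketch of that helper, assuming it keeps the heuristic of the code removed above (free roughly a fifth of the limit, at most half of what is available):

static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
                               int node, bool free_all, struct list_head *list)
{
        int tofree;

        if (!ac || !ac->avail)
                return;

        /* periodic reaping frees a fraction; free_all drains everything */
        tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
        if (tofree > ac->avail)
                tofree = (ac->avail + 1) / 2;

        free_block(cachep, ac->entry, tofree, node, list);
        ac->avail -= tofree;
        memmove(ac->entry, &ac->entry[tofree], sizeof(void *) * ac->avail);
}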
@@ -3929,7 +4098,7 @@ static void cache_reap(struct work_struct *w)
reap_alien(searchp, n);
- drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
+ drain_array(searchp, n, cpu_cache_get(searchp), node);
/*
* These are racy checks but it does not matter
@@ -3940,7 +4109,7 @@ static void cache_reap(struct work_struct *w)
n->next_reap = jiffies + REAPTIMEOUT_NODE;
- drain_array(searchp, n, n->shared, 0, node);
+ drain_array(searchp, n, n->shared, node);
if (n->free_touched)
n->free_touched = 0;
diff --git a/mm/slub.c b/mm/slub.c
index 4dbb109eb8cd..cf1faa4d3992 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count
tmp.counters = counters_new;
/*
* page->counters can cover frozen/inuse/objects as well
- * as page->_count. If we assign to ->counters directly
- * we run the risk of losing updates to page->_count, so
+ * as page->_refcount. If we assign to ->counters directly
+ * we run the risk of losing updates to page->_refcount, so
* be careful and only assign to the fields we need.
*/
page->frozen = tmp.frozen;
@@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
* may return off node objects because partial slabs are obtained
* from other nodes and filled up.
*
- * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
- * defrag_ratio = 1000) then every (well almost) allocation will
- * first attempt to defrag slab caches on other nodes. This means
- * scanning over all nodes to look for partial slabs which may be
- * expensive if we do it every time we are trying to find a slab
+ * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
+ * (which makes defrag_ratio = 1000) then every (well almost)
+ * allocation will first attempt to defrag slab caches on other nodes.
+ * This means scanning over all nodes to look for partial slabs which
+ * may be expensive if we do it every time we are trying to find a slab
* with available objects.
*/
if (!s->remote_node_defrag_ratio ||
@@ -3697,7 +3697,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
* s->cpu_partial is checked locklessly (see put_cpu_partial),
* so we have to make sure the change is visible.
*/
- kick_all_cpus_sync();
+ synchronize_sched();
}
flush_all(s);
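
As a worked example of the remote_node_defrag_ratio comment above, assuming the sysfs store path still multiplies the written value by 10 (as the "(which makes defrag_ratio = 1000)" parenthetical implies) and that the truncated condition compares it against get_cycles() % 1024: writing 100 to /sys/kernel/slab/xx/remote_node_defrag_ratio stores 1000, and get_cycles() % 1024 exceeds 1000 for only 23 of the 1024 possible values (about 2%), so nearly every allocation that misses locally goes on to scan the other nodes' partial lists.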
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 366ce3518703..0d457e7db8d6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
__SetPageLocked(new_page);
- SetPageSwapBacked(new_page);
+ __SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
radix_tree_preload_end();
@@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return new_page;
}
radix_tree_preload_end();
- ClearPageSwapBacked(new_page);
__ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
diff --git a/mm/util.c b/mm/util.c
index 6cc81e7b8705..8a1b3a1fb595 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -346,6 +346,29 @@ void *page_rmapping(struct page *page)
return __page_rmapping(page);
}
+/*
+ * Return true if this page is mapped into pagetables.
+ * For a compound page, it returns true if any subpage of the compound page is mapped.
+ */
+bool page_mapped(struct page *page)
+{
+ int i;
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ page = compound_head(page);
+ if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ return true;
+ if (PageHuge(page))
+ return false;
+ for (i = 0; i < hpage_nr_pages(page); i++) {
+ if (atomic_read(&page[i]._mapcount) >= 0)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(page_mapped);
+
struct anon_vma *page_anon_vma(struct page *page)
{
unsigned long mapping;
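
The newly exported page_mapped() above can be handed any subpage of a compound page; it checks compound_mapcount on the head first and then each subpage's _mapcount. A small caller sketch for illustration only; can_discard_page() is hypothetical and not part of the patch:

static bool can_discard_page(struct page *page)
{
        /*
         * page_mapped() covers both PMD mappings (compound_mapcount) and
         * PTE mappings of individual subpages, so any tail page is a valid
         * argument here.
         */
        return !page_mapped(page) && !PageDirty(page);
}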
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 142cb61f4822..dcfdfc1a0942 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
- * load is not satisfied before that of page->_count.
+ * load is not satisfied before that of page->_refcount.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
!list_empty(src); scan++) {
struct page *page;
- int nr_pages;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
@@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+ nr_taken += hpage_nr_pages(page);
list_move(&page->lru, dst);
- nr_taken += nr_pages;
break;
case -EBUSY:
@@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+ update_lru_size(lruvec, lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
@@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&zone->lru_lock);
- reclaim_stat->recent_scanned[file] += nr_taken;
-
if (global_reclaim(sc)) {
if (current_is_kswapd())
__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
@@ -1720,7 +1716,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
- * The downside is that we have to touch page->_count against each page.
+ * The downside is that we have to touch page->_refcount against each page.
* But we had to alter page->flags anyway.
*/
@@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+ update_lru_size(lruvec, lru, nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
pgmoved += nr_pages;
@@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
list_add(&page->lru, pages_to_free);
}
}
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}
@@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
- if (global_reclaim(sc))
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ update_lru_size(lruvec, lru, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
+ if (global_reclaim(sc))
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5e4300482897..5b72a8ad2813 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -570,49 +570,18 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
#ifdef CONFIG_NUMA
/*
- * zonelist = the list of zones passed to the allocator
- * z = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
- if (z->zone_pgdat == preferred_zone->zone_pgdat) {
- __inc_zone_state(z, NUMA_HIT);
- } else {
- __inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(preferred_zone, NUMA_FOREIGN);
- }
- if (z->node == ((flags & __GFP_OTHER_NODE) ?
- preferred_zone->node : numa_node_id()))
- __inc_zone_state(z, NUMA_LOCAL);
- else
- __inc_zone_state(z, NUMA_OTHER);
-}
-
-/*
* Determine the per node value of a stat item.
*/
unsigned long node_page_state(int node, enum zone_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
- return
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
- zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], item) +
- zone_page_state(&zones[ZONE_MOVABLE], item);
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_page_state(zones + i, item);
+
+ return count;
}
#endif
@@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
if (!memmap_valid_within(pfn, page, zone))
continue;
+ if (page_zone(page) != zone)
+ continue;
+
mtype = get_pageblock_migratetype(page);
if (mtype < MIGRATE_TYPES)
@@ -1069,13 +1041,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = min(block_end_pfn, end_pfn);
page = pfn_to_page(pfn);
- pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
if (PageBuddy(page)) {
pfn += (1UL << page_order(page)) - 1;
continue;
@@ -1378,6 +1354,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
static cpumask_var_t cpu_stat_off;
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+ refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ long val;
+ int err;
+ int i;
+
+ /*
+ * The regular update, every sysctl_stat_interval, may come later
+ * than expected: leaving a significant amount in per_cpu buckets.
+ * This is particularly misleading when checking a quantity of HUGE
+ * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
+ * which can equally be echo'ed to or cat'ted from (by root),
+ * can be used to update the stats just before reading them.
+ *
+ * Oh, and since global_page_state() etc. are so careful to hide
+ * transiently negative values, report an error here if any of
+ * the stats is negative, so we know to go looking for imbalance.
+ */
+ err = schedule_on_each_cpu(refresh_vm_stats);
+ if (err)
+ return err;
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ val = atomic_long_read(&vm_stat[i]);
+ if (val < 0) {
+ switch (i) {
+ case NR_ALLOC_BATCH:
+ case NR_PAGES_SCANNED:
+ /*
+ * These are often seen to go negative in
+ * recent kernels, but not to go permanently
+ * negative. Whilst it would be nicer not to
+ * have exceptions, rooting them out would be
+ * another task, of rather low priority.
+ */
+ break;
+ default:
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i], val);
+ err = -EINVAL;
+ break;
+ }
+ }
+ }
+ if (err)
+ return err;
+ if (write)
+ *ppos += *lenp;
+ else
+ *lenp = 0;
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
static void vmstat_update(struct work_struct *w)
{
if (refresh_cpu_vm_stats(true)) {
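
The /proc/sys/vm/stat_refresh handler added in the vmstat.c hunk above can be driven from userspace before sampling /proc/vmstat. A minimal C illustration (needs root; error handling kept short):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);

        if (fd < 0) {
                perror("open stat_refresh");
                return 1;
        }
        /* any write folds the per-cpu deltas via schedule_on_each_cpu() */
        if (write(fd, "1\n", 2) < 0)
                perror("write stat_refresh");
        close(fd);

        /* /proc/vmstat can now be read with up-to-date counters */
        return 0;
}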