summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig25
-rw-r--r--mm/Makefile1
-rw-r--r--mm/balloon_compaction.c4
-rw-r--r--mm/bounce.c44
-rw-r--r--mm/cleancache.c6
-rw-r--r--mm/compaction.c7
-rw-r--r--mm/filemap.c58
-rw-r--r--mm/huge_memory.c60
-rw-r--r--mm/hugetlb.c10
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h20
-rw-r--r--mm/ksm.c14
-rw-r--r--mm/memblock.c36
-rw-r--r--mm/memcontrol.c531
-rw-r--r--mm/memory-failure.c21
-rw-r--r--mm/memory.c10
-rw-r--r--mm/memory_hotplug.c9
-rw-r--r--mm/mempolicy.c32
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mincore.c7
-rw-r--r--mm/mlock.c108
-rw-r--r--mm/mm_init.c3
-rw-r--r--mm/mmap.c18
-rw-r--r--mm/mmu_notifier.c3
-rw-r--r--mm/nobootmem.c25
-rw-r--r--mm/oom_kill.c14
-rw-r--r--mm/page-writeback.c57
-rw-r--r--mm/page_alloc.c102
-rw-r--r--mm/page_io.c14
-rw-r--r--mm/readahead.c15
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/shmem.c65
-rw-r--r--mm/slab.h26
-rw-r--r--mm/slab_common.c89
-rw-r--r--mm/slub.c19
-rw-r--r--mm/swap.c36
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/swapfile.c11
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c87
-rw-r--r--mm/zsmalloc.c1106
-rw-r--r--mm/zswap.c4
42 files changed, 1994 insertions, 765 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 723bbe04a0b0..2d9f1504d75e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY
it can be cleared by hands.
See Documentation/vm/soft-dirty.txt for more details.
+
+config ZSMALLOC
+ bool "Memory allocator for compressed pages"
+ depends on MMU
+ default n
+ help
+ zsmalloc is a slab-based memory allocator designed to store
+ compressed RAM pages. zsmalloc uses virtual memory mapping
+ in order to reduce fragmentation. However, this results in a
+ non-standard allocator interface where a handle, not a pointer, is
+ returned by an alloc(). This handle must be mapped in order to
+ access the allocated space.
+
+config PGTABLE_MAPPING
+ bool "Use page table mapping to access object in zsmalloc"
+ depends on ZSMALLOC
+ help
+ By default, zsmalloc uses a copy-based object mapping method to
+ access allocations that span two pages. However, if a particular
+ architecture (ex, ARM) performs VM mapping faster than copying,
+ then you should select this. This causes zsmalloc to use page table
+ mapping rather than copying for object mapping.
+
+ You can check speed with zsmalloc benchmark[1].
+ [1] https://github.com/spartacus06/zsmalloc
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..310c90a09264 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZBUD) += zbud.o
+obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 07dbc8ec46cf..6e45a5074bf0 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page)
put_page(page);
} else {
WARN_ON(1);
- dump_page(page);
+ dump_page(page, "not movable balloon page");
}
unlock_page(page);
}
@@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage,
BUG_ON(!trylock_page(newpage));
if (WARN_ON(!__is_movable_balloon_page(page))) {
- dump_page(page);
+ dump_page(page, "not movable balloon page");
unlock_page(newpage);
return rc;
}
diff --git a/mm/bounce.c b/mm/bounce.c
index 5a7d58fb883b..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -98,27 +98,24 @@ int init_emergency_isa_pool(void)
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
- struct bio_vec *tovec, *fromvec;
- int i;
-
- bio_for_each_segment(tovec, to, i) {
- fromvec = from->bi_io_vec + i;
-
- /*
- * not bounced
- */
- if (tovec->bv_page == fromvec->bv_page)
- continue;
-
- /*
- * fromvec->bv_offset and fromvec->bv_len might have been
- * modified by the block layer, so use the original copy,
- * bounce_copy_vec already uses tovec->bv_len
- */
- vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+ struct bio_vec tovec, *fromvec = from->bi_io_vec;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(tovec, to, iter) {
+ if (tovec.bv_page != fromvec->bv_page) {
+ /*
+ * fromvec->bv_offset and fromvec->bv_len might have
+ * been modified by the block layer, so use the original
+ * copy, bounce_copy_vec already uses tovec->bv_len
+ */
+ vfrom = page_address(fromvec->bv_page) +
+ tovec.bv_offset;
+
+ bounce_copy_vec(&tovec, vfrom);
+ flush_dcache_page(tovec.bv_page);
+ }
- bounce_copy_vec(tovec, vfrom);
- flush_dcache_page(tovec->bv_page);
+ fromvec++;
}
}
@@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
{
struct bio *bio;
int rw = bio_data_dir(*bio_orig);
- struct bio_vec *to, *from;
+ struct bio_vec *to, from;
+ struct bvec_iter iter;
unsigned i;
if (force)
goto bounce;
- bio_for_each_segment(from, *bio_orig, i)
- if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+ bio_for_each_segment(from, *bio_orig, iter)
+ if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
goto bounce;
return;
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5875f48ce279..d0eac4350403 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -237,7 +237,7 @@ int __cleancache_get_page(struct page *page)
goto out;
}
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (fake_pool_id < 0)
goto out;
@@ -279,7 +279,7 @@ void __cleancache_put_page(struct page *page)
return;
}
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (fake_pool_id < 0)
return;
@@ -318,7 +318,7 @@ void __cleancache_invalidate_page(struct address_space *mapping,
if (pool_id < 0)
return;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (cleancache_get_key(mapping->host, &key) >= 0) {
cleancache_ops->invalidate_page(pool_id,
key, page->index);
diff --git a/mm/compaction.c b/mm/compaction.c
index 3a91a2ea3d34..b48c5259ea33 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -523,7 +523,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (!isolation_suitable(cc, page))
goto next_pageblock;
- /* Skip if free */
+ /*
+ * Skip if free. page_order cannot be used without zone->lock
+ * as nothing prevents parallel allocations or buddy merging.
+ */
if (PageBuddy(page))
continue;
@@ -601,7 +604,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (__isolate_lru_page(page, mode) != 0)
continue;
- VM_BUG_ON(PageTransCompound(page));
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* Successfully isolated */
cc->finished_update_migrate = true;
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..d56d3c145b9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -409,9 +409,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
int error;
- VM_BUG_ON(!PageLocked(old));
- VM_BUG_ON(!PageLocked(new));
- VM_BUG_ON(new->mapping);
+ VM_BUG_ON_PAGE(!PageLocked(old), old);
+ VM_BUG_ON_PAGE(!PageLocked(new), new);
+ VM_BUG_ON_PAGE(new->mapping, new);
error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (!error) {
@@ -461,8 +461,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
{
int error;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapBacked(page), page);
error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
@@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
@@ -760,7 +760,7 @@ repeat:
page_cache_release(page);
goto repeat;
}
- VM_BUG_ON(page->index != offset);
+ VM_BUG_ON_PAGE(page->index != offset, page);
}
return page;
}
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
- if (pos < size) {
- retval = filemap_write_and_wait_range(mapping, pos,
+ retval = filemap_write_and_wait_range(mapping, pos,
pos + iov_length(iov, nr_segs) - 1);
- if (!retval) {
- retval = mapping->a_ops->direct_IO(READ, iocb,
- iov, pos, nr_segs);
- }
- if (retval > 0) {
- *ppos = pos + retval;
- count -= retval;
- }
+ if (!retval) {
+ retval = mapping->a_ops->direct_IO(READ, iocb,
+ iov, pos, nr_segs);
+ }
+ if (retval > 0) {
+ *ppos = pos + retval;
+ count -= retval;
+ }
- /*
- * Btrfs can have a short DIO read if we encounter
- * compressed extents, so if there was an error, or if
- * we've already read everything we wanted to, or if
- * there was a short read because we hit EOF, go ahead
- * and return. Otherwise fallthrough to buffered io for
- * the rest of the read.
- */
- if (retval < 0 || !count || *ppos >= size) {
- file_accessed(filp);
- goto out;
- }
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read.
+ */
+ if (retval < 0 || !count || *ppos >= size) {
+ file_accessed(filp);
+ goto out;
}
}
@@ -1656,7 +1654,7 @@ retry_find:
put_page(page);
goto retry_find;
}
- VM_BUG_ON(page->index != offset);
+ VM_BUG_ON_PAGE(page->index != offset, page);
/*
* We have a locked page in the page cache, now we need to check
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..82166bf974e1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -130,8 +130,14 @@ static int set_recommended_min_free_kbytes(void)
(unsigned long) nr_free_buffer_pages() / 20);
recommended_min <<= (PAGE_SHIFT-10);
- if (recommended_min > min_free_kbytes)
+ if (recommended_min > min_free_kbytes) {
+ if (user_min_free_kbytes >= 0)
+ pr_info("raising min_free_kbytes from %d to %lu "
+ "to help transparent hugepage allocations\n",
+ min_free_kbytes, recommended_min);
+
min_free_kbytes = recommended_min;
+ }
setup_per_zone_wmarks();
return 0;
}
@@ -655,7 +661,7 @@ out:
hugepage_exit_sysfs(hugepage_kobj);
return err;
}
-module_init(hugepage_init)
+subsys_initcall(hugepage_init);
static int __init setup_transparent_hugepage(char *str)
{
@@ -712,7 +718,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable_t pgtable;
spinlock_t *ptl;
- VM_BUG_ON(!PageCompound(page));
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -893,7 +899,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out;
}
src_page = pmd_page(pmd);
- VM_BUG_ON(!PageHead(src_page));
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
get_page(src_page);
page_dup_rmap(src_page);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1067,7 +1073,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
@@ -1133,7 +1139,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
page = pmd_page(orig_pmd);
- VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -1211,7 +1217,7 @@ alloc:
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
page_remove_rmap(page);
put_page(page);
}
@@ -1249,7 +1255,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
goto out;
page = pmd_page(*pmd);
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
if (flags & FOLL_TOUCH) {
pmd_t _pmd;
/*
@@ -1274,7 +1280,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
}
}
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
- VM_BUG_ON(!PageCompound(page));
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
get_page_foll(page);
@@ -1432,9 +1438,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
} else {
page = pmd_page(orig_pmd);
page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
tlb_remove_page(tlb, page);
@@ -1502,19 +1508,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- if (new_ptl != old_ptl) {
- pgtable_t pgtable;
- /*
- * Move preallocated PTE page table if new_pmd is on
- * different PMD page table.
- */
+ if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+ pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
- spin_unlock(new_ptl);
}
+ set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
out:
@@ -2176,9 +2178,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (unlikely(!page))
goto out;
- VM_BUG_ON(PageCompound(page));
- BUG_ON(!PageAnon(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
/* cannot use mapcount: can't collapse if there's a gup pin */
if (page_count(page) != 1)
@@ -2201,8 +2203,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/* 0 stands for page_is_file_cache(page) == false */
inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
/* If there is no mapped pte young don't collapse the page */
if (pte_young(pteval) || PageReferenced(page) ||
@@ -2232,7 +2234,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON(page_mapcount(src_page) != 1);
+ VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
@@ -2311,7 +2313,7 @@ static struct page
struct vm_area_struct *vma, unsigned long address,
int node)
{
- VM_BUG_ON(*hpage);
+ VM_BUG_ON_PAGE(*hpage, *hpage);
/*
* Allocate the page while the vma is still valid and under
* the mmap_sem read mode so there is no memory allocation
@@ -2580,7 +2582,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
*/
node = page_to_nid(page);
khugepaged_node_load[node]++;
- VM_BUG_ON(PageCompound(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
/* cannot use mapcount: can't collapse if there's a gup pin */
@@ -2876,7 +2878,7 @@ again:
return;
}
page = pmd_page(*pmd);
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(!page_count(page), page);
get_page(page);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 04306b9de90d..c01cb9fedb18 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -584,7 +584,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1 << PG_writeback);
}
- VM_BUG_ON(hugetlb_cgroup_from_page(page));
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
@@ -1089,7 +1089,7 @@ retry:
* no users -- drop the buddy allocator's reference.
*/
put_page_testzero(page);
- VM_BUG_ON(page_count(page));
+ VM_BUG_ON_PAGE(page_count(page), page);
enqueue_huge_page(h, page);
}
free:
@@ -3503,7 +3503,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
bool isolate_huge_page(struct page *page, struct list_head *list)
{
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
if (!get_page_unless_zero(page))
return false;
spin_lock(&hugetlb_lock);
@@ -3514,7 +3514,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
void putback_active_hugepage(struct page *page)
{
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
@@ -3523,7 +3523,7 @@ void putback_active_hugepage(struct page *page)
bool is_hugepage_active(struct page *page)
{
- VM_BUG_ON(!PageHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
/*
* This function can be called for a tail page because the caller,
* scan_movable_pages, scans through a given pfn-range which typically
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index d747a84e09b0..cb00829bb466 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -390,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON(!PageHuge(oldhpage));
+ VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
spin_lock(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
diff --git a/mm/internal.h b/mm/internal.h
index a346ba120e42..29e1e761f9eb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -27,8 +27,8 @@ static inline void set_page_count(struct page *page, int v)
*/
static inline void set_page_refcounted(struct page *page)
{
- VM_BUG_ON(PageTail(page));
- VM_BUG_ON(atomic_read(&page->_count));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
set_page_count(page, 1);
}
@@ -46,7 +46,7 @@ static inline void __get_page_tail_foll(struct page *page,
* speculative page access (like in
* page_cache_get_speculative()) on tail pages.
*/
- VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
if (get_page_head)
atomic_inc(&page->first_page->_count);
get_huge_page_tail(page);
@@ -71,7 +71,7 @@ static inline void get_page_foll(struct page *page)
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);
}
}
@@ -83,7 +83,6 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern bool zone_reclaimable(struct zone *zone);
/*
@@ -99,6 +98,7 @@ extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
+extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -142,9 +142,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
#endif
/*
- * function for dealing with page's order in buddy system.
- * zone->lock is already acquired when we use these.
- * So, we don't need atomic page->flags operations here.
+ * This function returns the order of a free page in the buddy system. In
+ * general, page_zone(page)->lock must be held by the caller to prevent the
+ * page from being allocated in parallel and returning garbage as the order.
+ * If a caller does not hold page_zone(page)->lock, it must guarantee that the
+ * page cannot be allocated or merged in parallel.
*/
static inline unsigned long page_order(struct page *page)
{
@@ -173,7 +175,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
struct page *page)
{
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index 3df141e5f3e0..aa4c7c7250c1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1898,13 +1898,13 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
int ret = SWAP_AGAIN;
int search_new_forks = 0;
- VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON_PAGE(!PageKsm(page), page);
/*
* Rely on the page lock to protect against concurrent modifications
* to that page's node of the stable tree.
*/
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
stable_node = page_stable_node(page);
if (!stable_node)
@@ -1958,13 +1958,13 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
struct stable_node *stable_node;
- VM_BUG_ON(!PageLocked(oldpage));
- VM_BUG_ON(!PageLocked(newpage));
- VM_BUG_ON(newpage->mapping != oldpage->mapping);
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
stable_node = page_stable_node(newpage);
if (stable_node) {
- VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+ VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
stable_node->kpfn = page_to_pfn(newpage);
/*
* newpage->mapping was set in advance; now we need smp_wmb()
@@ -2345,4 +2345,4 @@ out_free:
out:
return err;
}
-module_init(ksm_init)
+subsys_initcall(ksm_init);
diff --git a/mm/memblock.c b/mm/memblock.c
index 1c2ef2c7edab..39a31e7f0045 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -266,31 +266,34 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
}
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
phys_addr_t *addr)
{
if (memblock.reserved.regions == memblock_reserved_init_regions)
return 0;
- /*
- * Don't allow nobootmem allocator to free reserved memory regions
- * array if
- * - CONFIG_DEBUG_FS is enabled;
- * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled;
- * - reserved memory regions array have been resized during boot.
- * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved"
- * will show garbage instead of state of memory reservations.
- */
- if (IS_ENABLED(CONFIG_DEBUG_FS) &&
- !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
- return 0;
-
*addr = __pa(memblock.reserved.regions);
return PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.reserved.max);
}
+phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.memory.regions == memblock_memory_init_regions)
+ return 0;
+
+ *addr = __pa(memblock.memory.regions);
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+}
+
+#endif
+
/**
* memblock_double_array - double the size of the memblock regions array
* @type: memblock type of the regions array being doubled
@@ -981,9 +984,6 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
if (!align)
align = SMP_CACHE_BYTES;
- /* align @size to avoid excessive fragmentation on reserved array */
- size = round_up(size, align);
-
found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
if (found && !memblock_reserve(found, size))
return found;
@@ -1077,8 +1077,8 @@ static void * __init memblock_virt_alloc_internal(
if (!align)
align = SMP_CACHE_BYTES;
- /* align @size to avoid excessive fragmentation on reserved array */
- size = round_up(size, align);
+ if (max_addr > memblock.current_limit)
+ max_addr = memblock.current_limit;
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 67dd2a881433..53385cd4e6f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,7 +49,6 @@
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
@@ -150,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
* matches memcg->dead_count of the hierarchy root group.
*/
struct mem_cgroup *last_visited;
- unsigned long last_dead_count;
+ int last_dead_count;
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -381,23 +380,12 @@ struct mem_cgroup {
/* WARNING: nodeinfo must be the last member here */
};
-static size_t memcg_size(void)
-{
- return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-}
-
/* internal only representation about the status of kmem accounting. */
enum {
- KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
- ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
@@ -409,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
- clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
/*
@@ -1139,16 +1117,22 @@ skip_node:
* skipped and we should continue the tree walk.
* last_visited css is safe to use because it is
* protected by css_get and the tree walk is rcu safe.
+ *
+ * We do not take a reference on the root of the tree walk
+ * because we might race with the root removal when it would
+ * be the only node in the iterated hierarchy and mem_cgroup_iter
+ * would end up in an endless loop because it expects that at
+ * least one valid node will be returned. Root cannot disappear
+ * because caller of the iterator should hold it already so
+ * skipping css reference should be safe.
*/
if (next_css) {
- struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
+ if ((next_css->flags & CSS_ONLINE) &&
+ (next_css == &root->css || css_tryget(next_css)))
+ return mem_cgroup_from_css(next_css);
- if (css_tryget(&mem->css))
- return mem;
- else {
- prev_css = next_css;
- goto skip_node;
- }
+ prev_css = next_css;
+ goto skip_node;
}
return NULL;
@@ -1182,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
if (iter->last_dead_count == *sequence) {
smp_rmb();
position = iter->last_visited;
- if (position && !css_tryget(&position->css))
+
+ /*
+ * We cannot take a reference to root because we might race
+ * with root removal and returning NULL would end up in
+ * an endless loop on the iterator user level when root
+ * would be returned all the time.
+ */
+ if (position && position != root &&
+ !css_tryget(&position->css))
position = NULL;
}
return position;
@@ -1191,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
struct mem_cgroup *last_visited,
struct mem_cgroup *new_position,
+ struct mem_cgroup *root,
int sequence)
{
- if (last_visited)
+ /* root reference counting symmetric to mem_cgroup_iter_load */
+ if (last_visited && last_visited != root)
css_put(&last_visited->css);
/*
* We store the sequence count from the time @last_visited was
@@ -1268,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, seq);
+ mem_cgroup_iter_update(iter, last_visited, memcg, root,
+ seq);
if (!memcg)
iter->generation++;
@@ -1865,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
- if (points > chosen_points) {
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points &&
+ thread_group_leader(chosen))
+ continue;
+
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
}
css_task_iter_end(&it);
}
@@ -2904,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
swp_entry_t ent;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
@@ -2938,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
bool anon;
lock_page_cgroup(pc);
- VM_BUG_ON(PageCgroupUsed(pc));
+ VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
@@ -2973,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
@@ -2999,11 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) ==
- KMEM_ACCOUNTED_MASK;
+ memcg_kmem_is_active(memcg);
}
/*
@@ -3102,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
css_put(&memcg->css);
}
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
- if (!memcg)
- return;
-
- mutex_lock(&memcg->slab_caches_mutex);
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
- mutex_unlock(&memcg->slab_caches_mutex);
-}
-
/*
* helper for acessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
@@ -3122,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-static int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
- int num, ret;
-
- num = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (num < 0)
- return num;
- /*
- * After this point, kmem_accounted (that we test atomically in
- * the beginning of this conditional), is no longer 0. This
- * guarantees only one process will set the following boolean
- * to true. We don't need test_and_set because we're protected
- * by the set_limit_mutex anyway.
- */
- memcg_kmem_set_activated(memcg);
-
- ret = memcg_update_all_caches(num+1);
- if (ret) {
- ida_simple_remove(&kmem_limited_groups, num);
- memcg_kmem_clear_activated(memcg);
- return ret;
- }
-
- memcg->kmemcg_id = num;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
- mutex_init(&memcg->slab_caches_mutex);
- return 0;
-}
-
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
@@ -3195,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
if (num_groups > memcg_limited_groups_array_size) {
int i;
+ struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params) {
- s->memcg_params = cur_params;
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
return -ENOMEM;
- }
- s->memcg_params->is_root_cache = true;
+ new_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
@@ -3220,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
- s->memcg_params->memcg_caches[i] =
+ new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
@@ -3233,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
* bigger than the others. And all updates will reset this
* anyway.
*/
- kfree(cur_params);
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
}
return 0;
}
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
{
size_t size;
@@ -3267,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
return 0;
}
-void memcg_release_cache(struct kmem_cache *s)
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+ kfree(s->memcg_params);
+}
+
+void memcg_register_cache(struct kmem_cache *s)
{
struct kmem_cache *root;
struct mem_cgroup *memcg;
int id;
- /*
- * This happens, for instance, when a root cache goes away before we
- * add any memcg.
- */
- if (!s->memcg_params)
+ if (is_root_cache(s))
return;
- if (s->memcg_params->is_root_cache)
- goto out;
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
+ root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
- id = memcg_cache_id(memcg);
+ id = memcg_cache_id(memcg);
+
+ css_get(&memcg->css);
+
+
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
+
+ /*
+ * Initialize the pointer to this cache in its parent's memcg_params
+ * before adding it to the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(root->memcg_params->memcg_caches[id]);
+ root->memcg_params->memcg_caches[id] = s;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+void memcg_unregister_cache(struct kmem_cache *s)
+{
+ struct kmem_cache *root;
+ struct mem_cgroup *memcg;
+ int id;
+
+ if (is_root_cache(s))
+ return;
+
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
root = s->memcg_params->root_cache;
- root->memcg_params->memcg_caches[id] = NULL;
+ memcg = s->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
+ /*
+ * Clear the pointer to this cache in its parent's memcg_params only
+ * after removing it from the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
+ root->memcg_params->memcg_caches[id] = NULL;
+
css_put(&memcg->css);
-out:
- kfree(s->memcg_params);
}
/*
@@ -3354,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
* So if we aren't down to zero, we'll just schedule a worker and try
* again
*/
- if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- return;
- } else
+ else
kmem_cache_destroy(cachep);
}
@@ -3394,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
}
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
- struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
{
- struct kmem_cache *new;
+ struct kmem_cache *new = NULL;
static char *tmp_name = NULL;
+ static DEFINE_MUTEX(mutex); /* protects tmp_name */
- lockdep_assert_held(&memcg_cache_mutex);
+ BUG_ON(!memcg_can_account_kmem(memcg));
+ mutex_lock(&mutex);
/*
* kmem_cache_create_memcg duplicates the given name and
* cgroup_name for this name requires RCU context.
@@ -3424,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
if (!tmp_name) {
tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!tmp_name)
- return NULL;
+ goto out;
}
rcu_read_lock();
@@ -3434,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
(s->flags & ~SLAB_PANIC), s->ctor, s);
-
if (new)
new->allocflags |= __GFP_KMEMCG;
-
- return new;
-}
-
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct kmem_cache *new_cachep;
- int idx;
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- idx = memcg_cache_id(memcg);
-
- mutex_lock(&memcg_cache_mutex);
- new_cachep = cache_from_memcg_idx(cachep, idx);
- if (new_cachep) {
- css_put(&memcg->css);
- goto out;
- }
-
- new_cachep = kmem_cache_dup(memcg, cachep);
- if (new_cachep == NULL) {
- new_cachep = cachep;
- css_put(&memcg->css);
- goto out;
- }
-
- atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
- cachep->memcg_params->memcg_caches[idx] = new_cachep;
- /*
- * the readers won't lock, make sure everybody sees the updated value,
- * so they won't put stuff in the queue again for no reason
- */
- wmb();
+ else
+ new = s;
out:
- mutex_unlock(&memcg_cache_mutex);
- return new_cachep;
+ mutex_unlock(&mutex);
+ return new;
}
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
@@ -3495,9 +3452,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
*
* Still, we don't want anyone else freeing memcg_caches under our
* noses, which can happen if a new memcg comes to life. As usual,
- * we'll take the set_limit_mutex to protect ourselves against this.
+ * we'll take the activate_kmem_mutex to protect ourselves against
+ * this.
*/
- mutex_lock(&set_limit_mutex);
+ mutex_lock(&activate_kmem_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
@@ -3520,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
}
- mutex_unlock(&set_limit_mutex);
+ mutex_unlock(&activate_kmem_mutex);
}
struct create_work {
@@ -3552,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
cw = container_of(w, struct create_work, work);
memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ css_put(&cw->memcg->css);
kfree(cw);
}
@@ -3611,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
- int idx;
+ struct kmem_cache *memcg_cachep;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3625,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
- idx = memcg_cache_id(memcg);
-
- /*
- * barrier to mare sure we're always seeing the up to date value. The
- * code updating memcg_caches will issue a write barrier to match this.
- */
- read_barrier_depends();
- if (likely(cache_from_memcg_idx(cachep, idx))) {
- cachep = cache_from_memcg_idx(cachep, idx);
+ memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ if (likely(memcg_cachep)) {
+ cachep = memcg_cachep;
goto out;
}
@@ -3787,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
if (!memcg)
return;
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
@@ -3866,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page,
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
@@ -3959,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page,
parent = root_mem_cgroup;
if (nr_pages > 1) {
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
flags = compound_lock_irqsave(page);
}
@@ -3993,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
@@ -4013,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
VM_BUG_ON(!mm);
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_ANON);
@@ -4218,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
/*
* Check if our page_cgroup is valid
@@ -4310,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page)
/* early check. */
if (page_mapped(page))
return;
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
/*
* If the page is in swap cache, uncharge should be deferred
* to the swap path, which also properly accounts swap usage
@@ -4330,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page)
void mem_cgroup_uncharge_cache_page(struct page *page)
{
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping, page);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
}
@@ -5189,11 +5142,23 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
return val;
}
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
- int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int err = 0;
+ int memcg_id;
+
+ if (memcg_kmem_is_active(memcg))
+ return 0;
+
+ /*
+ * We are going to allocate memory for data shared by all memory
+ * cgroups so let's stop accounting here.
+ */
+ memcg_stop_kmem_account();
+
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
@@ -5207,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- mutex_lock(&set_limit_mutex);
- if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
- if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
- ret = -EBUSY;
- goto out;
- }
- ret = res_counter_set_limit(&memcg->kmem, val);
- VM_BUG_ON(ret);
+ if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+ err = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (err)
+ goto out;
- ret = memcg_update_cache_sizes(memcg);
- if (ret) {
- res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
- goto out;
- }
- static_key_slow_inc(&memcg_kmem_enabled_key);
- /*
- * setting the active bit after the inc will guarantee no one
- * starts accounting before all call sites are patched
- */
- memcg_kmem_set_active(memcg);
- } else
- ret = res_counter_set_limit(&memcg->kmem, val);
+ memcg_id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (memcg_id < 0) {
+ err = memcg_id;
+ goto out;
+ }
+
+ /*
+ * Make sure we have enough space for this cgroup in each root cache's
+ * memcg_params.
+ */
+ err = memcg_update_all_caches(memcg_id + 1);
+ if (err)
+ goto out_rmid;
+
+ memcg->kmemcg_id = memcg_id;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+
+ /*
+ * We couldn't have accounted to this cgroup, because it hasn't got the
+ * active bit set yet, so this should succeed.
+ */
+ err = res_counter_set_limit(&memcg->kmem, limit);
+ VM_BUG_ON(err);
+
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * Setting the active bit after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg_kmem_set_active(memcg);
out:
- mutex_unlock(&set_limit_mutex);
- mutex_unlock(&memcg_create_mutex);
-#endif
+ memcg_resume_kmem_account();
+ return err;
+
+out_rmid:
+ ida_simple_remove(&kmem_limited_groups, memcg_id);
+ goto out;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int ret;
+
+ mutex_lock(&activate_kmem_mutex);
+ ret = __memcg_activate_kmem(memcg, limit);
+ mutex_unlock(&activate_kmem_mutex);
+ return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int ret;
+
+ if (!memcg_kmem_is_active(memcg))
+ ret = memcg_activate_kmem(memcg, val);
+ else
+ ret = res_counter_set_limit(&memcg->kmem, val);
return ret;
}
-#ifdef CONFIG_MEMCG_KMEM
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- if (!parent)
- goto out;
- memcg->kmem_account_flags = parent->kmem_account_flags;
- /*
- * When that happen, we need to disable the static branch only on those
- * memcgs that enabled it. To achieve this, we would be forced to
- * complicate the code by keeping track of which memcgs were the ones
- * that actually enabled limits, and which ones got it from its
- * parents.
- *
- * It is a lot simpler just to do static_key_slow_inc() on every child
- * that is accounted.
- */
- if (!memcg_kmem_is_active(memcg))
- goto out;
+ if (!parent)
+ return 0;
+ mutex_lock(&activate_kmem_mutex);
/*
- * __mem_cgroup_free() will issue static_key_slow_dec() because this
- * memcg is active already. If the later initialization fails then the
- * cgroup core triggers the cleanup so we do not have to do it here.
+ * If the parent cgroup is not kmem-active now, it cannot be activated
+ * after this point, because it has at least one child already.
*/
- static_key_slow_inc(&memcg_kmem_enabled_key);
-
- mutex_lock(&set_limit_mutex);
- memcg_stop_kmem_account();
- ret = memcg_update_cache_sizes(memcg);
- memcg_resume_kmem_account();
- mutex_unlock(&set_limit_mutex);
-out:
+ if (memcg_kmem_is_active(parent))
+ ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+ mutex_unlock(&activate_kmem_mutex);
return ret;
}
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_MEMCG_KMEM */
/*
@@ -5306,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
else if (type == _KMEM)
- ret = memcg_update_kmem_limit(css, val);
+ ret = memcg_update_kmem_limit(memcg, val);
else
return -EINVAL;
break;
@@ -6405,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size = memcg_size();
+ size_t size;
- /* Can be very big if nr_node_ids is very big */
- if (size < PAGE_SIZE)
- memcg = kzalloc(size, GFP_KERNEL);
- else
- memcg = vzalloc(size);
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
@@ -6423,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return memcg;
out_free:
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
return NULL;
}
@@ -6444,7 +6433,6 @@ out_free:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
@@ -6465,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
* the cgroup_lock.
*/
disarm_static_keys(memcg);
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
}
/*
@@ -6549,7 +6534,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
- int error = 0;
if (css->cgroup->id > MEM_CGROUP_ID_MAX)
return -ENOSPC;
@@ -6584,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
mem_cgroup_subsys.broken_hierarchy = true;
}
-
- error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- return error;
+
+ return memcg_init_kmem(memcg, &mem_cgroup_subsys);
}
/*
@@ -6896,7 +6879,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
- VM_BUG_ON(!page || !PageHead(page));
+ VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!move_anon())
return ret;
pc = lookup_page_cgroup(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b25ed321e667..4f08a2d61487 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -856,14 +856,14 @@ static int page_action(struct page_state *ps, struct page *p,
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno, int flags)
+ int trapno, int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
int kill = 1, forcekill;
- struct page *hpage = compound_head(p);
+ struct page *hpage = *hpagep;
struct page *ppage;
if (PageReserved(p) || PageSlab(p))
@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* We pinned the head page for hwpoison handling,
* now we split the thp and we are interested in
* the hwpoisoned raw page, so move the refcount
- * to it.
+ * to it. Similarly, page lock is shifted.
*/
if (hpage != p) {
put_page(hpage);
get_page(p);
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
}
/* THP is split, so ppage should be the real poisoned page. */
ppage = p;
@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(ppage, &tokill);
- if (hpage != ppage)
- lock_page(ppage);
-
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(ppage));
- if (hpage != ppage)
- unlock_page(ppage);
-
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is thp tail page, hpage points to the raw
+ * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
res = -EBUSY;
goto out;
diff --git a/mm/memory.c b/mm/memory.c
index 86487dfa5e59..be6a0c0d4ae0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -289,7 +289,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return 0;
batch = tlb->active;
}
- VM_BUG_ON(batch->nr > batch->max);
+ VM_BUG_ON_PAGE(batch->nr > batch->max, page);
return batch->max - batch->nr;
}
@@ -671,7 +671,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
- dump_page(page);
+ dump_page(page, "bad pte");
printk(KERN_ALERT
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -2702,7 +2702,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto unwritable_page;
}
} else
- VM_BUG_ON(!PageLocked(old_page));
+ VM_BUG_ON_PAGE(!PageLocked(old_page), old_page);
/*
* Since we dropped the lock we need to revalidate
@@ -3358,7 +3358,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
- VM_BUG_ON(!PageLocked(vmf.page));
+ VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
/*
* Should we do an early C-O-W break?
@@ -3395,7 +3395,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto unwritable_page;
}
} else
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
page_mkwrite = 1;
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cc2ab37220b7..a650db29606f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1107,17 +1107,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
if (ret)
return ret;
- lock_memory_hotplug();
-
res = register_memory_resource(start, size);
ret = -EEXIST;
if (!res)
- goto out;
+ return ret;
{ /* Stupid hack to suppress address-never-null warning */
void *p = NODE_DATA(nid);
new_pgdat = !p;
}
+
+ lock_memory_hotplug();
+
new_node = !node_online(nid);
if (new_node) {
pgdat = hotadd_new_pgdat(nid, start);
@@ -1309,7 +1310,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
#ifdef CONFIG_DEBUG_VM
printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
pfn);
- dump_page(page);
+ dump_page(page, "failed to remove from LRU");
#endif
put_page(page);
/* Because we don't have big zone->lock. we should
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0cd2c4d4e270..ae3c8f3595d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
return 0;
}
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
- BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
if (nr_updated)
@@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
{
return 0;
}
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Walk through page tables and collect pages to be migrated.
@@ -1199,10 +1198,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
}
if (PageHuge(page)) {
- if (vma)
- return alloc_huge_page_noerr(vma, address, 1);
- else
- return NULL;
+ BUG_ON(!vma);
+ return alloc_huge_page_noerr(vma, address, 1);
}
/*
* if !vma, alloc_page_vma() will use task or system default policy
@@ -2657,7 +2654,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
}
#ifdef CONFIG_NUMA_BALANCING
-static bool __initdata numabalancing_override;
+static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
@@ -2666,9 +2663,15 @@ static void __init check_numabalancing_enable(void)
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
+ /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+ if (numabalancing_override)
+ set_numabalancing_state(numabalancing_override == 1);
+
if (nr_node_ids > 1 && !numabalancing_override) {
- printk(KERN_INFO "Enabling automatic NUMA balancing. "
- "Configure with numa_balancing= or sysctl");
+ pr_info("%s automatic NUMA balancing. "
+ "Configure with numa_balancing= or the "
+ "kernel.numa_balancing sysctl",
+ numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
@@ -2678,18 +2681,17 @@ static int __init setup_numabalancing(char *str)
int ret = 0;
if (!str)
goto out;
- numabalancing_override = true;
if (!strcmp(str, "enable")) {
- set_numabalancing_state(true);
+ numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
- set_numabalancing_state(false);
+ numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
- printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+ pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
@@ -2928,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
- if (pol && pol != &default_policy) {
+ if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mode = pol->mode;
flags = pol->flags;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index a8025befc323..482a33d89134 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -499,7 +499,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
if (PageUptodate(page))
SetPageUptodate(newpage);
if (TestClearPageActive(page)) {
- VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
@@ -871,7 +871,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- VM_BUG_ON(PageAnon(page));
+ VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
try_to_free_buffers(page);
goto uncharge;
@@ -1548,8 +1548,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
__GFP_NOMEMALLOC | __GFP_NORETRY |
__GFP_NOWARN) &
~GFP_IOFS, 0);
- if (newpage)
- page_cpupid_xchg_last(newpage, page_cpupid_last(page));
return newpage;
}
@@ -1618,7 +1616,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
- VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
+ VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
@@ -1753,8 +1751,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
if (!new_page)
goto out_fail;
- page_cpupid_xchg_last(new_page, page_cpupid_last(page));
-
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
put_page(new_page);
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8f..101623378fbf 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -225,13 +225,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
- if (is_vm_hugetlb_page(vma)) {
- mincore_hugetlb_page_range(vma, addr, end, vec);
- return (end - addr) >> PAGE_SHIFT;
- }
-
- end = pmd_addr_end(addr, end);
-
if (is_vm_hugetlb_page(vma))
mincore_hugetlb_page_range(vma, addr, end, vec);
else
diff --git a/mm/mlock.c b/mm/mlock.c
index 10819ed4df3e..4e1a68162285 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -91,6 +91,26 @@ void mlock_vma_page(struct page *page)
}
/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+ if (PageLRU(page)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+ if (getpage)
+ get_page(page);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Finish munlock after successful page isolation
*
* Page must be locked. This is a wrapper for try_to_munlock()
@@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page)
static void __munlock_isolation_failed(struct page *page)
{
if (PageUnevictable(page))
- count_vm_event(UNEVICTABLE_PGSTRANDED);
+ __count_vm_event(UNEVICTABLE_PGSTRANDED);
else
- count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
}
/**
@@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page)
unsigned int munlock_vma_page(struct page *page)
{
unsigned int nr_pages;
+ struct zone *zone = page_zone(page);
BUG_ON(!PageLocked(page));
- if (TestClearPageMlocked(page)) {
- nr_pages = hpage_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- if (!isolate_lru_page(page))
- __munlock_isolated_page(page);
- else
- __munlock_isolation_failed(page);
- } else {
- nr_pages = hpage_nr_pages(page);
- }
-
/*
- * Regardless of the original PageMlocked flag, we determine nr_pages
- * after touching the flag. This leaves a possible race with a THP page
- * split, such that a whole THP page was munlocked, but nr_pages == 1.
- * Returning a smaller mask due to that is OK, the worst that can
- * happen is subsequent useless scanning of the former tail pages.
- * The NR_MLOCK accounting can however become broken.
+ * Serialize with any parallel __split_huge_page_refcount() which
+ * might otherwise copy PageMlocked to part of the tail pages before
+ * we clear it in the head page. It also stabilizes hpage_nr_pages().
*/
+ spin_lock_irq(&zone->lru_lock);
+
+ nr_pages = hpage_nr_pages(page);
+ if (!TestClearPageMlocked(page))
+ goto unlock_out;
+
+ __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+
+ if (__munlock_isolate_lru_page(page, true)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __munlock_isolated_page(page);
+ goto out;
+ }
+ __munlock_isolation_failed(page);
+
+unlock_out:
+ spin_unlock_irq(&zone->lru_lock);
+
+out:
return nr_pages - 1;
}
@@ -253,8 +279,8 @@ static int __mlock_posix_error_return(long retval)
static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
int *pgrescued)
{
- VM_BUG_ON(PageLRU(page));
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (page_mapcount(page) <= 1 && page_evictable(page)) {
pagevec_add(pvec, page);
@@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
struct page *page = pvec->pages[i];
if (TestClearPageMlocked(page)) {
- struct lruvec *lruvec;
- int lru;
-
- if (PageLRU(page)) {
- lruvec = mem_cgroup_page_lruvec(page, zone);
- lru = page_lru(page);
- /*
- * We already have pin from follow_page_mask()
- * so we can spare the get_page() here.
- */
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, lru);
- } else {
- __munlock_isolation_failed(page);
- goto skip_munlock;
- }
-
- } else {
-skip_munlock:
/*
- * We won't be munlocking this page in the next phase
- * but we still need to release the follow_page_mask()
- * pin. We cannot do it under lru_lock however. If it's
- * the last pin, __page_cache_release would deadlock.
+ * We already have pin from follow_page_mask()
+ * so we can spare the get_page() here.
*/
- pagevec_add(&pvec_putback, pvec->pages[i]);
- pvec->pages[i] = NULL;
+ if (__munlock_isolate_lru_page(page, false))
+ continue;
+ else
+ __munlock_isolation_failed(page);
}
+
+ /*
+ * We won't be munlocking this page in the next phase
+ * but we still need to release the follow_page_mask()
+ * pin. We cannot do it under lru_lock however. If it's
+ * the last pin, __page_cache_release() would deadlock.
+ */
+ pagevec_add(&pvec_putback, pvec->pages[i]);
+ pvec->pages[i] = NULL;
}
delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 68562e92d50c..4074caf9936b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void)
return 0;
}
-
-__initcall(mm_sysfs_init);
+postcore_initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index a0e7153a79e6..20ff0c33274c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,7 +894,15 @@ again: remove_next = 1 + (end > next->vm_end);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if (vma->vm_flags ^ vm_flags)
+ /*
+ * VM_SOFTDIRTY should not prevent from VMA merging, if we
+ * match the flags but dirty bit -- the caller should mark
+ * merged VMA as dirty. If dirty bit won't be excluded from
+ * comparison, we increase pressue on the memory system forcing
+ * the kernel to generate new VMAs when old one could be
+ * extended instead.
+ */
+ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
return 0;
if (vma->vm_file != file)
return 0;
@@ -1083,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
@@ -3142,7 +3150,7 @@ static int init_user_reserve(void)
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
}
-module_init(init_user_reserve)
+subsys_initcall(init_user_reserve);
/*
* Initialise sysctl_admin_reserve_kbytes.
@@ -3163,7 +3171,7 @@ static int init_admin_reserve(void)
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
}
-module_init(init_admin_reserve)
+subsys_initcall(init_admin_reserve);
/*
* Reinititalise user and admin reserves if memory is added or removed.
@@ -3233,4 +3241,4 @@ static int __meminit init_reserve_notifier(void)
return 0;
}
-module_init(init_reserve_notifier)
+subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 93e6089cb456..41cefdf0aadd 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void)
{
return init_srcu_struct(&srcu);
}
-
-module_init(mmu_notifier_init);
+subsys_initcall(mmu_notifier_init);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 19121ceb8874..f73f2987a852 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -45,7 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (!addr)
return NULL;
- memblock_reserve(addr, size);
+ if (memblock_reserve(addr, size))
+ return NULL;
+
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
/*
@@ -114,16 +116,27 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
- phys_addr_t start, end, size;
+ phys_addr_t start, end;
u64 i;
for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
count += __free_memory_core(start, end);
- /* free range that is used for reserved array if we allocate it */
- size = get_allocated_memblock_reserved_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+ {
+ phys_addr_t size;
+
+ /* Free memblock.reserved array if it was allocated */
+ size = get_allocated_memblock_reserved_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
+
+ /* Free memblock.memory array if it was allocated */
+ size = get_allocated_memblock_memory_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
+ }
+#endif
return count;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 054ff47c4478..3291e82d4352 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -178,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- adj -= 30;
+ points -= (points * 3) / 100;
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
@@ -327,10 +327,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
break;
};
points = oom_badness(p, NULL, nodemask, totalpages);
- if (points > chosen_points) {
- chosen = p;
- chosen_points = points;
- }
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points && thread_group_leader(chosen))
+ continue;
+
+ chosen = p;
+ chosen_points = points;
}
if (chosen)
get_task_struct(chosen);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 63807583d8e8..2d30e2cfe804 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -191,6 +191,26 @@ static unsigned long writeout_period_time = 0;
* global dirtyable memory first.
*/
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ unsigned long nr_pages;
+
+ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+
+ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
+ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+
+ return nr_pages;
+}
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -198,11 +218,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
unsigned long x = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+ x += zone_dirtyable_memory(z);
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
@@ -238,9 +256,12 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ x = global_page_state(NR_FREE_PAGES);
x -= min(x, dirty_balance_reserve);
+ x += global_page_state(NR_INACTIVE_FILE);
+ x += global_page_state(NR_ACTIVE_FILE);
+
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
/**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
- *
- * Returns the zone's number of pages potentially available for dirty
- * page cache. This is the base value for the per-zone dirty limits.
- */
-static unsigned long zone_dirtyable_memory(struct zone *zone)
-{
- /*
- * The effective global number of dirtyable pages may exclude
- * highmem as a big-picture measure to keep the ratio between
- * dirty memory and lowmem reasonable.
- *
- * But this function is purely about the individual zone and a
- * highmem zone can hold its share of dirty pages, so we don't
- * care about vm_highmem_is_dirtyable here.
- */
- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
- zone_reclaimable_pages(zone);
-
- /* don't allow this to underflow */
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
- return nr_pages;
-}
-
-/**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone
* @zone: the zone
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 533e2147d14f..e3758a09a009 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
};
int min_free_kbytes = 1024;
-int user_min_free_kbytes;
+int user_min_free_kbytes = -1;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page)
+static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -329,7 +329,7 @@ static void bad_page(struct page *page)
printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page(page);
+ dump_page_badflags(page, reason, bad_flags);
print_modules();
dump_stack();
@@ -383,7 +383,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
int bad = 0;
if (unlikely(compound_order(page) != order)) {
- bad_page(page);
+ bad_page(page, "wrong compound order", 0);
bad++;
}
@@ -392,8 +392,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageTail(p) || (p->first_page != page))) {
- bad_page(page);
+ if (unlikely(!PageTail(p))) {
+ bad_page(page, "PageTail not set", 0);
+ bad++;
+ } else if (unlikely(p->first_page != page)) {
+ bad_page(page, "first_page not consistent", 0);
bad++;
}
__ClearPageTail(p);
@@ -506,12 +509,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
if (page_is_guard(buddy) && page_order(buddy) == order) {
- VM_BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
- VM_BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return 1;
}
return 0;
@@ -561,8 +564,8 @@ static inline void __free_one_page(struct page *page,
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
- VM_BUG_ON(page_idx & ((1 << order) - 1));
- VM_BUG_ON(bad_range(zone, page));
+ VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
while (order < MAX_ORDER-1) {
buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +621,23 @@ out:
static inline int free_pages_check(struct page *page)
{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
- (mem_cgroup_bad_page_check(page)))) {
- bad_page(page);
+ char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
return 1;
}
page_cpupid_reset_last(page);
@@ -813,7 +827,7 @@ static inline void expand(struct zone *zone, struct page *page,
area--;
high--;
size >>= 1;
- VM_BUG_ON(bad_range(zone, &page[size]));
+ VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
#ifdef CONFIG_DEBUG_PAGEALLOC
if (high < debug_guardpage_minorder()) {
@@ -843,12 +857,23 @@ static inline void expand(struct zone *zone, struct page *page,
*/
static inline int check_new_page(struct page *page)
{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
- (mem_cgroup_bad_page_check(page)))) {
- bad_page(page);
+ char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
return 1;
}
return 0;
@@ -955,7 +980,7 @@ int move_freepages(struct zone *zone,
for (page = start_page; page <= end_page;) {
/* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -1404,8 +1429,8 @@ void split_page(struct page *page, unsigned int order)
{
int i;
- VM_BUG_ON(PageCompound(page));
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!page_count(page), page);
#ifdef CONFIG_KMEMCHECK
/*
@@ -1552,7 +1577,7 @@ again:
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
- VM_BUG_ON(bad_range(zone, page));
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
@@ -5729,7 +5754,12 @@ module_init(init_per_zone_wmark_min)
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
@@ -5996,7 +6026,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
- VM_BUG_ON(!zone_spans_pfn(zone, pfn));
+ VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
if (flags & value)
@@ -6494,12 +6524,24 @@ static void dump_page_flags(unsigned long flags)
printk(")\n");
}
-void dump_page(struct page *page)
+void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
+ if (reason)
+ pr_alert("page dumped because: %s\n", reason);
+ if (page->flags & badflags) {
+ pr_alert("bad because of flags:\n");
+ dump_page_flags(page->flags & badflags);
+ }
mem_cgroup_print_bad_page(page);
}
+
+void dump_page(struct page *page, char *reason)
+{
+ dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL_GPL(dump_page);
diff --git a/mm/page_io.c b/mm/page_io.c
index 8c79a4764be0..7c59ef681381 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
bio = bio_alloc(gfp_flags, 1);
if (bio) {
- bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
- bio->bi_sector <<= PAGE_SHIFT - 9;
+ bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+ bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
- bio->bi_size = PAGE_SIZE;
+ bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}
return bio;
@@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err)
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
@@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err)
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
@@ -320,8 +320,8 @@ int swap_readpage(struct page *page)
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageUptodate(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageUptodate(page), page);
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
diff --git a/mm/readahead.c b/mm/readahead.c
index 7cdbb44aa90b..0de2360d65f3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -211,8 +211,6 @@ out:
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
- int ret = 0;
-
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
@@ -226,15 +224,13 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
this_chunk = nr_to_read;
err = __do_page_cache_readahead(mapping, filp,
offset, this_chunk, 0);
- if (err < 0) {
- ret = err;
- break;
- }
- ret += err;
+ if (err < 0)
+ return err;
+
offset += this_chunk;
nr_to_read -= this_chunk;
}
- return ret;
+ return 0;
}
/*
@@ -576,8 +572,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
if (!mapping || !mapping->a_ops)
return -EINVAL;
- force_page_cache_readahead(mapping, filp, index, nr);
- return 0;
+ return force_page_cache_readahead(mapping, filp, index, nr);
}
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
diff --git a/mm/rmap.c b/mm/rmap.c
index 962e2a1e13a0..d9d42316a99a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -848,9 +848,9 @@ out:
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
if (vma->vm_flags & VM_SHARED)
- return 0;
+ return false;
- return 1;
+ return true;
}
int page_mkclean(struct page *page)
@@ -894,9 +894,9 @@ void page_move_anon_rmap(struct page *page,
{
struct anon_vma *anon_vma = vma->anon_vma;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON(!anon_vma);
- VM_BUG_ON(page->index != linear_page_index(vma, address));
+ VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
@@ -995,7 +995,7 @@ void do_page_add_anon_rmap(struct page *page,
if (unlikely(PageKsm(page)))
return;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
/* address might be in next vma when migration races vma_adjust */
if (first)
__page_set_anon_rmap(page, vma, address, exclusive);
@@ -1481,7 +1481,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
.anon_lock = page_lock_anon_vma_read,
};
- VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
/*
* During exec, a temporary VMA is setup and later moved.
@@ -1533,7 +1533,7 @@ int try_to_munlock(struct page *page)
};
- VM_BUG_ON(!PageLocked(page) || PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
ret = rmap_walk(page, &rwc);
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 902a14842b74..1f18c9d0d93e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt;
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
-#include <linux/generic_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -285,8 +285,8 @@ static int shmem_add_to_page_cache(struct page *page,
{
int error;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
page_cache_get(page);
page->mapping = mapping;
@@ -491,7 +491,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
}
}
@@ -568,7 +568,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
lock_page(page);
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
}
}
@@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
}
setattr_copy(inode, attr);
-#ifdef CONFIG_TMPFS_POSIX_ACL
if (attr->ia_valid & ATTR_MODE)
- error = generic_acl_chmod(inode);
-#endif
+ error = posix_acl_chmod(inode, inode->i_mode);
return error;
}
@@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
-#ifdef CONFIG_TMPFS_POSIX_ACL
- error = generic_acl_init(inode, dir);
- if (error) {
- iput(inode);
- return error;
- }
-#endif
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
error = security_inode_init_security(inode, dir,
&dentry->d_name,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- }
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
@@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
dget(dentry); /* Extra count - pin the dentry in core */
}
return error;
+out_iput:
+ iput(inode);
+ return error;
}
static int
@@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
error = security_inode_init_security(inode, dir,
NULL,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- }
-#ifdef CONFIG_TMPFS_POSIX_ACL
- error = generic_acl_init(inode, dir);
- if (error) {
- iput(inode);
- return error;
- }
-#else
- error = 0;
-#endif
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
d_tmpfile(dentry, inode);
}
return error;
+out_iput:
+ iput(inode);
+ return error;
}
static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode,
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
- &generic_acl_access_handler,
- &generic_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL
};
@@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = {
.getxattr = shmem_getxattr,
.listxattr = shmem_listxattr,
.removexattr = shmem_removexattr,
+ .set_acl = simple_set_acl,
#endif
};
@@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
#endif
};
@@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
#endif
};
diff --git a/mm/slab.h b/mm/slab.h
index 0859c4241ba1..8184a7cde272 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,12 +160,36 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said the caller must assure the memcg's cache won't go away. Since once
+ * created a memcg's cache is destroyed only along with the root cache, it is
+ * true if we are going to allocate from the cache or hold a reference to the
+ * root cache by other means. Otherwise, we should hold either the slab_mutex
+ * or the memcg's slab_caches_mutex while calling this function and accessing
+ * the returned value.
+ */
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
+
if (!s->memcg_params)
return NULL;
- return s->memcg_params->memcg_caches[idx];
+
+ rcu_read_lock();
+ params = rcu_dereference(s->memcg_params);
+ cachep = params->memcg_caches[idx];
+ rcu_read_unlock();
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match this (see
+ * memcg_register_cache()).
+ */
+ smp_read_barrier_depends();
+ return cachep;
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb399b0e4..1ec3c619ba04 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -171,13 +171,26 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
struct kmem_cache *parent_cache)
{
struct kmem_cache *s = NULL;
- int err = 0;
+ int err;
get_online_cpus();
mutex_lock(&slab_mutex);
- if (!kmem_cache_sanity_check(memcg, name, size) == 0)
- goto out_locked;
+ err = kmem_cache_sanity_check(memcg, name, size);
+ if (err)
+ goto out_unlock;
+
+ if (memcg) {
+ /*
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can
+ * try to create the same cache, but only one of them may
+ * succeed. Therefore if we get here and see the cache has
+ * already been created, we silently return NULL.
+ */
+ if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
+ goto out_unlock;
+ }
/*
* Some allocators will constraint the set of valid flags to a subset
@@ -189,44 +202,47 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
- goto out_locked;
+ goto out_unlock;
+ err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
- if (s) {
- s->object_size = s->size = size;
- s->align = calculate_alignment(flags, align, size);
- s->ctor = ctor;
+ if (!s)
+ goto out_unlock;
- if (memcg_register_cache(memcg, s, parent_cache)) {
- kmem_cache_free(kmem_cache, s);
- err = -ENOMEM;
- goto out_locked;
- }
+ s->object_size = s->size = size;
+ s->align = calculate_alignment(flags, align, size);
+ s->ctor = ctor;
- s->name = kstrdup(name, GFP_KERNEL);
- if (!s->name) {
- kmem_cache_free(kmem_cache, s);
- err = -ENOMEM;
- goto out_locked;
- }
+ s->name = kstrdup(name, GFP_KERNEL);
+ if (!s->name)
+ goto out_free_cache;
- err = __kmem_cache_create(s, flags);
- if (!err) {
- s->refcount = 1;
- list_add(&s->list, &slab_caches);
- memcg_cache_list_add(memcg, s);
- } else {
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- }
- } else
- err = -ENOMEM;
+ err = memcg_alloc_cache_params(memcg, s, parent_cache);
+ if (err)
+ goto out_free_cache;
+
+ err = __kmem_cache_create(s, flags);
+ if (err)
+ goto out_free_cache;
+
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+ memcg_register_cache(s);
-out_locked:
+out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
if (err) {
+ /*
+ * There is no point in flooding logs with warnings or
+ * especially crashing the system if we fail to create a cache
+ * for a memcg. In this case we will be accounting the memcg
+ * allocation to the root cgroup until we succeed to create its
+ * own cache, but it isn't that critical.
+ */
+ if (!memcg)
+ return NULL;
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
@@ -236,11 +252,15 @@ out_locked:
name, err);
dump_stack();
}
-
return NULL;
}
-
return s;
+
+out_free_cache:
+ memcg_free_cache_params(s);
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ goto out_unlock;
}
struct kmem_cache *
@@ -263,11 +283,12 @@ void kmem_cache_destroy(struct kmem_cache *s)
list_del(&s->list);
if (!__kmem_cache_shutdown(s)) {
+ memcg_unregister_cache(s);
mutex_unlock(&slab_mutex);
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
- memcg_release_cache(s);
+ memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
} else {
diff --git a/mm/slub.c b/mm/slub.c
index 545a170ebf9f..2b1a6970e46f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page)
__bit_spin_unlock(PG_locked, &page->flags);
}
+static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
+{
+ struct page tmp;
+ tmp.counters = counters_new;
+ /*
+ * page->counters can cover frozen/inuse/objects as well
+ * as page->_count. If we assign to ->counters directly
+ * we run the risk of losing updates to page->_count, so
+ * be careful and only assign to the fields we need.
+ */
+ page->frozen = tmp.frozen;
+ page->inuse = tmp.inuse;
+ page->objects = tmp.objects;
+}
+
/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
@@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- page->counters = counters_new;
+ set_page_slub_counters(page, counters_new);
slab_unlock(page);
return 1;
}
@@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- page->counters = counters_new;
+ set_page_slub_counters(page, counters_new);
slab_unlock(page);
local_irq_restore(flags);
return 1;
diff --git a/mm/swap.c b/mm/swap.c
index d1100b619e61..b31ba67d440a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,7 +57,7 @@ static void __page_cache_release(struct page *page)
spin_lock_irqsave(&zone->lru_lock, flags);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -130,8 +130,8 @@ static void put_compound_page(struct page *page)
* __split_huge_page_refcount cannot race
* here.
*/
- VM_BUG_ON(!PageHead(page_head));
- VM_BUG_ON(page_mapcount(page) != 0);
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
if (put_page_testzero(page_head)) {
/*
* If this is the tail of a slab
@@ -148,7 +148,7 @@ static void put_compound_page(struct page *page)
* the compound page enters the buddy
* allocator.
*/
- VM_BUG_ON(PageSlab(page_head));
+ VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
__put_compound_page(page_head);
}
return;
@@ -199,7 +199,7 @@ out_put_single:
__put_single_page(page);
return;
}
- VM_BUG_ON(page_head != page->first_page);
+ VM_BUG_ON_PAGE(page_head != page->first_page, page);
/*
* We can release the refcount taken by
* get_page_unless_zero() now that
@@ -207,12 +207,12 @@ out_put_single:
* compound_lock.
*/
if (put_page_testzero(page_head))
- VM_BUG_ON(1);
+ VM_BUG_ON_PAGE(1, page_head);
/* __split_huge_page_refcount will wait now */
- VM_BUG_ON(page_mapcount(page) <= 0);
+ VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
atomic_dec(&page->_mapcount);
- VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
+ VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
compound_unlock_irqrestore(page_head, flags);
if (put_page_testzero(page_head)) {
@@ -223,7 +223,7 @@ out_put_single:
}
} else {
/* page_head is a dangling pointer */
- VM_BUG_ON(PageTail(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
goto out_put_single;
}
}
@@ -264,7 +264,7 @@ bool __get_page_tail(struct page *page)
* page. __split_huge_page_refcount
* cannot race here.
*/
- VM_BUG_ON(!PageHead(page_head));
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
__get_page_tail_foll(page, true);
return true;
} else {
@@ -604,8 +604,8 @@ EXPORT_SYMBOL(__lru_cache_add);
*/
void lru_cache_add(struct page *page)
{
- VM_BUG_ON(PageActive(page) && PageUnevictable(page));
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
__lru_cache_add(page);
}
@@ -846,7 +846,7 @@ void release_pages(struct page **pages, int nr, int cold)
}
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
@@ -888,9 +888,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
{
const int file = 0;
- VM_BUG_ON(!PageHead(page));
- VM_BUG_ON(PageCompound(page_tail));
- VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page_tail), page);
+ VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
@@ -929,7 +929,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
int active = PageActive(page);
enum lru_list lru = page_lru(page);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8ca2af..98e85e9c2b2d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,9 +83,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
int error;
struct address_space *address_space;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageSwapCache(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
page_cache_get(page);
SetPageSwapCache(page);
@@ -139,9 +139,9 @@ void __delete_from_swap_cache(struct page *page)
swp_entry_t entry;
struct address_space *address_space;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageSwapCache(page));
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
entry.val = page_private(page);
address_space = swap_address_space(entry);
@@ -165,8 +165,8 @@ int add_to_swap(struct page *page, struct list_head *list)
swp_entry_t entry;
int err;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageUptodate(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageUptodate(page), page);
entry = get_swap_page();
if (!entry.val)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 612a7c9795f6..c6c13b050a58 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -616,7 +616,7 @@ scan:
}
}
offset = si->lowest_bit;
- while (++offset < scan_base) {
+ while (offset < scan_base) {
if (!si->swap_map[offset]) {
spin_lock(&si->lock);
goto checks;
@@ -629,6 +629,7 @@ scan:
cond_resched();
latency_ration = LATENCY_LIMIT;
}
+ offset++;
}
spin_lock(&si->lock);
@@ -906,7 +907,7 @@ int reuse_swap_page(struct page *page)
{
int count;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
count = page_mapcount(page);
@@ -926,7 +927,7 @@ int reuse_swap_page(struct page *page)
*/
int try_to_free_swap(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!PageSwapCache(page))
return 0;
@@ -2714,7 +2715,7 @@ struct swap_info_struct *page_swap_info(struct page *page)
*/
struct address_space *__page_file_mapping(struct page *page)
{
- VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -2722,7 +2723,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
pgoff_t __page_file_index(struct page *page)
{
swp_entry_t swap = { .val = page_private(page) };
- VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e4f0db2a3eae..0fdf96803c5b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the physical pfn it maps to.
+ * Walk a vmap address to the struct page it maps.
*/
-unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+struct page *vmalloc_to_page(const void *vmalloc_addr)
{
unsigned long addr = (unsigned long) vmalloc_addr;
- unsigned long pfn = 0;
+ struct page *page = NULL;
pgd_t *pgd = pgd_offset_k(addr);
/*
@@ -244,23 +244,23 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
ptep = pte_offset_map(pmd, addr);
pte = *ptep;
if (pte_present(pte))
- pfn = pte_pfn(pte);
+ page = pte_page(pte);
pte_unmap(ptep);
}
}
}
- return pfn;
+ return page;
}
-EXPORT_SYMBOL(vmalloc_to_pfn);
+EXPORT_SYMBOL(vmalloc_to_page);
/*
- * Map a vmalloc()-space virtual address to the struct page.
+ * Map a vmalloc()-space virtual address to the physical page frame number.
*/
-struct page *vmalloc_to_page(const void *vmalloc_addr)
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
- return pfn_to_page(vmalloc_to_pfn(vmalloc_addr));
+ return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
-EXPORT_SYMBOL(vmalloc_to_page);
+EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eea668d9cff6..a9c74b409681 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc)
}
#endif
-unsigned long zone_reclaimable_pages(struct zone *zone)
+static unsigned long zone_reclaimable_pages(struct zone *zone)
{
int nr;
@@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
- while (total_scan >= batch_size) {
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (max_pass), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= max_pass) {
unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
- shrinkctl->nr_to_scan = batch_size;
+ shrinkctl->nr_to_scan = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
+ count_vm_events(SLABS_SCANNED, nr_to_scan);
+ total_scan -= nr_to_scan;
cond_resched();
}
@@ -352,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (!node_online(shrinkctl->nid))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
- (shrinkctl->nid != 0))
- break;
-
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ shrinkctl->nid = 0;
freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ nr_pages_scanned, lru_pages);
+ continue;
+ }
+
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (node_online(shrinkctl->nid))
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
}
}
@@ -603,7 +621,7 @@ void putback_lru_page(struct page *page)
bool is_unevictable;
int was_unevictable = PageUnevictable(page);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
redo:
ClearPageUnevictable(page);
@@ -794,8 +812,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (!trylock_page(page))
goto keep;
- VM_BUG_ON(PageActive(page));
- VM_BUG_ON(page_zone(page) != zone);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
sc->nr_scanned++;
@@ -1079,14 +1097,14 @@ activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && vm_swap_full())
try_to_free_swap(page);
- VM_BUG_ON(PageActive(page));
+ VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
- VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
+ VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
free_hot_cold_page_list(&free_pages, 1);
@@ -1240,7 +1258,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
switch (__isolate_lru_page(page, mode)) {
case 0:
@@ -1295,7 +1313,7 @@ int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(!page_count(page), page);
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
@@ -1366,7 +1384,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
struct page *page = lru_to_page(page_list);
int lru;
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
spin_unlock_irq(&zone->lru_lock);
@@ -1586,7 +1604,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
page = lru_to_page(list);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
@@ -3297,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
wake_up_interruptible(&pgdat->kswapd_wait);
}
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
- int nr;
-
- nr = global_page_state(NR_ACTIVE_FILE) +
- global_page_state(NR_INACTIVE_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += global_page_state(NR_ACTIVE_ANON) +
- global_page_state(NR_INACTIVE_ANON);
-
- return nr;
-}
-
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3701,7 +3698,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
if (page_evictable(page)) {
enum lru_list lru = page_lru_base_type(page);
- VM_BUG_ON(PageActive(page));
+ VM_BUG_ON_PAGE(PageActive(page), page);
ClearPageUnevictable(page);
del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
new file mode 100644
index 000000000000..c03ca5e9fe15
--- /dev/null
+++ b/mm/zsmalloc.c
@@ -0,0 +1,1106 @@
+/*
+ * zsmalloc memory allocator
+ *
+ * Copyright (C) 2011 Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the license that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+/*
+ * This allocator is designed for use with zram. Thus, the allocator is
+ * supposed to work well under low memory conditions. In particular, it
+ * never attempts higher order page allocation which is very likely to
+ * fail under memory pressure. On the other hand, if we just use single
+ * (0-order) pages, it would suffer from very high fragmentation --
+ * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+ * This was one of the major issues with its predecessor (xvmalloc).
+ *
+ * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+ * and links them together using various 'struct page' fields. These linked
+ * pages act as a single higher-order page i.e. an object can span 0-order
+ * page boundaries. The code refers to these linked pages as a single entity
+ * called zspage.
+ *
+ * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+ * since this satisfies the requirements of all its current users (in the
+ * worst case, page is incompressible and is thus stored "as-is" i.e. in
+ * uncompressed form). For allocation requests larger than this size, failure
+ * is returned (see zs_malloc).
+ *
+ * Additionally, zs_malloc() does not return a dereferenceable pointer.
+ * Instead, it returns an opaque handle (unsigned long) which encodes actual
+ * location of the allocated object. The reason for this indirection is that
+ * zsmalloc does not keep zspages permanently mapped since that would cause
+ * issues on 32-bit systems where the VA region for kernel space mappings
+ * is very small. So, before using the allocating memory, the object has to
+ * be mapped using zs_map_object() to get a usable pointer and subsequently
+ * unmapped using zs_unmap_object().
+ *
+ * Following is how we use various fields and flags of underlying
+ * struct page(s) to form a zspage.
+ *
+ * Usage of struct page fields:
+ * page->first_page: points to the first component (0-order) page
+ * page->index (union with page->freelist): offset of the first object
+ * starting in this page. For the first page, this is
+ * always 0, so we use this field (aka freelist) to point
+ * to the first free object in zspage.
+ * page->lru: links together all component pages (except the first page)
+ * of a zspage
+ *
+ * For _first_ page only:
+ *
+ * page->private (union with page->first_page): refers to the
+ * component page after the first page
+ * page->freelist: points to the first free object in zspage.
+ * Free objects are linked together using in-place
+ * metadata.
+ * page->objects: maximum number of objects we can store in this
+ * zspage (class->zspage_order * PAGE_SIZE / class->size)
+ * page->lru: links together first pages of various zspages.
+ * Basically forming list of zspages in a fullness group.
+ * page->mapping: class index and fullness group of the zspage
+ *
+ * Usage of struct page flags:
+ * PG_private: identifies the first component page
+ * PG_private2: identifies the last component page
+ *
+ */
+
+#ifdef CONFIG_ZSMALLOC_DEBUG
+#define DEBUG
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/vmalloc.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/zsmalloc.h>
+
+/*
+ * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * These two conditions ensure that any 'struct link_free' itself doesn't
+ * span more than 1 page which avoids complex case of mapping 2 pages simply
+ * to restore link_free pointer values.
+ */
+#define ZS_ALIGN 8
+
+/*
+ * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
+ * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
+ */
+#define ZS_MAX_ZSPAGE_ORDER 2
+#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+
+/*
+ * Object location (<PFN>, <obj_idx>) is encoded as
+ * as single (unsigned long) handle value.
+ *
+ * Note that object index <obj_idx> is relative to system
+ * page <PFN> it is stored in, so for each sub-page belonging
+ * to a zspage, obj_idx starts with 0.
+ *
+ * This is made more complicated by various memory models and PAE.
+ */
+
+#ifndef MAX_PHYSMEM_BITS
+#ifdef CONFIG_HIGHMEM64G
+#define MAX_PHYSMEM_BITS 36
+#else /* !CONFIG_HIGHMEM64G */
+/*
+ * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
+ * be PAGE_SHIFT
+ */
+#define MAX_PHYSMEM_BITS BITS_PER_LONG
+#endif
+#endif
+#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
+#define ZS_MIN_ALLOC_SIZE \
+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
+
+/*
+ * On systems with 4K page size, this gives 254 size classes! There is a
+ * trader-off here:
+ * - Large number of size classes is potentially wasteful as free page are
+ * spread across these classes
+ * - Small number of size classes causes large internal fragmentation
+ * - Probably its better to use specific size classes (empirically
+ * determined). NOTE: all those class sizes must be set as multiple of
+ * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
+ *
+ * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
+ * (reason above)
+ */
+#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
+#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
+ ZS_SIZE_CLASS_DELTA + 1)
+
+/*
+ * We do not maintain any list for completely empty or full pages
+ */
+enum fullness_group {
+ ZS_ALMOST_FULL,
+ ZS_ALMOST_EMPTY,
+ _ZS_NR_FULLNESS_GROUPS,
+
+ ZS_EMPTY,
+ ZS_FULL
+};
+
+/*
+ * We assign a page to ZS_ALMOST_EMPTY fullness group when:
+ * n <= N / f, where
+ * n = number of allocated objects
+ * N = total number of objects zspage can store
+ * f = 1/fullness_threshold_frac
+ *
+ * Similarly, we assign zspage to:
+ * ZS_ALMOST_FULL when n > N / f
+ * ZS_EMPTY when n == 0
+ * ZS_FULL when n == N
+ *
+ * (see: fix_fullness_group())
+ */
+static const int fullness_threshold_frac = 4;
+
+struct size_class {
+ /*
+ * Size of objects stored in this class. Must be multiple
+ * of ZS_ALIGN.
+ */
+ int size;
+ unsigned int index;
+
+ /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
+ int pages_per_zspage;
+
+ spinlock_t lock;
+
+ /* stats */
+ u64 pages_allocated;
+
+ struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+};
+
+/*
+ * Placed within free objects to form a singly linked list.
+ * For every zspage, first_page->freelist gives head of this list.
+ *
+ * This must be power of 2 and less than or equal to ZS_ALIGN
+ */
+struct link_free {
+ /* Handle of next free chunk (encodes <PFN, obj_idx>) */
+ void *next;
+};
+
+struct zs_pool {
+ struct size_class size_class[ZS_SIZE_CLASSES];
+
+ gfp_t flags; /* allocation flags used when growing pool */
+};
+
+/*
+ * A zspage's class index and fullness group
+ * are encoded in its (first)page->mapping
+ */
+#define CLASS_IDX_BITS 28
+#define FULLNESS_BITS 4
+#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
+#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
+
+struct mapping_area {
+#ifdef CONFIG_PGTABLE_MAPPING
+ struct vm_struct *vm; /* vm area for mapping object that span pages */
+#else
+ char *vm_buf; /* copy buffer for objects that span pages */
+#endif
+ char *vm_addr; /* address of kmap_atomic()'ed pages */
+ enum zs_mapmode vm_mm; /* mapping mode */
+};
+
+
+/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+
+static int is_first_page(struct page *page)
+{
+ return PagePrivate(page);
+}
+
+static int is_last_page(struct page *page)
+{
+ return PagePrivate2(page);
+}
+
+static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+ enum fullness_group *fullness)
+{
+ unsigned long m;
+ BUG_ON(!is_first_page(page));
+
+ m = (unsigned long)page->mapping;
+ *fullness = m & FULLNESS_MASK;
+ *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
+}
+
+static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+ enum fullness_group fullness)
+{
+ unsigned long m;
+ BUG_ON(!is_first_page(page));
+
+ m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
+ (fullness & FULLNESS_MASK);
+ page->mapping = (struct address_space *)m;
+}
+
+/*
+ * zsmalloc divides the pool into various size classes where each
+ * class maintains a list of zspages where each zspage is divided
+ * into equal sized chunks. Each allocation falls into one of these
+ * classes depending on its size. This function returns index of the
+ * size class which has chunk size big enough to hold the give size.
+ */
+static int get_size_class_index(int size)
+{
+ int idx = 0;
+
+ if (likely(size > ZS_MIN_ALLOC_SIZE))
+ idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
+ ZS_SIZE_CLASS_DELTA);
+
+ return idx;
+}
+
+/*
+ * For each size class, zspages are divided into different groups
+ * depending on how "full" they are. This was done so that we could
+ * easily find empty or nearly empty zspages when we try to shrink
+ * the pool (not yet implemented). This function returns fullness
+ * status of the given page.
+ */
+static enum fullness_group get_fullness_group(struct page *page)
+{
+ int inuse, max_objects;
+ enum fullness_group fg;
+ BUG_ON(!is_first_page(page));
+
+ inuse = page->inuse;
+ max_objects = page->objects;
+
+ if (inuse == 0)
+ fg = ZS_EMPTY;
+ else if (inuse == max_objects)
+ fg = ZS_FULL;
+ else if (inuse <= max_objects / fullness_threshold_frac)
+ fg = ZS_ALMOST_EMPTY;
+ else
+ fg = ZS_ALMOST_FULL;
+
+ return fg;
+}
+
+/*
+ * Each size class maintains various freelists and zspages are assigned
+ * to one of these freelists based on the number of live objects they
+ * have. This functions inserts the given zspage into the freelist
+ * identified by <class, fullness_group>.
+ */
+static void insert_zspage(struct page *page, struct size_class *class,
+ enum fullness_group fullness)
+{
+ struct page **head;
+
+ BUG_ON(!is_first_page(page));
+
+ if (fullness >= _ZS_NR_FULLNESS_GROUPS)
+ return;
+
+ head = &class->fullness_list[fullness];
+ if (*head)
+ list_add_tail(&page->lru, &(*head)->lru);
+
+ *head = page;
+}
+
+/*
+ * This function removes the given zspage from the freelist identified
+ * by <class, fullness_group>.
+ */
+static void remove_zspage(struct page *page, struct size_class *class,
+ enum fullness_group fullness)
+{
+ struct page **head;
+
+ BUG_ON(!is_first_page(page));
+
+ if (fullness >= _ZS_NR_FULLNESS_GROUPS)
+ return;
+
+ head = &class->fullness_list[fullness];
+ BUG_ON(!*head);
+ if (list_empty(&(*head)->lru))
+ *head = NULL;
+ else if (*head == page)
+ *head = (struct page *)list_entry((*head)->lru.next,
+ struct page, lru);
+
+ list_del_init(&page->lru);
+}
+
+/*
+ * Each size class maintains zspages in different fullness groups depending
+ * on the number of live objects they contain. When allocating or freeing
+ * objects, the fullness status of the page can change, say, from ALMOST_FULL
+ * to ALMOST_EMPTY when freeing an object. This function checks if such
+ * a status change has occurred for the given page and accordingly moves the
+ * page from the freelist of the old fullness group to that of the new
+ * fullness group.
+ */
+static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+ struct page *page)
+{
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group currfg, newfg;
+
+ BUG_ON(!is_first_page(page));
+
+ get_zspage_mapping(page, &class_idx, &currfg);
+ newfg = get_fullness_group(page);
+ if (newfg == currfg)
+ goto out;
+
+ class = &pool->size_class[class_idx];
+ remove_zspage(page, class, currfg);
+ insert_zspage(page, class, newfg);
+ set_zspage_mapping(page, class_idx, newfg);
+
+out:
+ return newfg;
+}
+
+/*
+ * We have to decide on how many pages to link together
+ * to form a zspage for each size class. This is important
+ * to reduce wastage due to unusable space left at end of
+ * each zspage which is given as:
+ * wastage = Zp - Zp % size_class
+ * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
+ *
+ * For example, for size class of 3/8 * PAGE_SIZE, we should
+ * link together 3 PAGE_SIZE sized pages to form a zspage
+ * since then we can perfectly fit in 8 such objects.
+ */
+static int get_pages_per_zspage(int class_size)
+{
+ int i, max_usedpc = 0;
+ /* zspage order which gives maximum used size per KB */
+ int max_usedpc_order = 1;
+
+ for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
+ int zspage_size;
+ int waste, usedpc;
+
+ zspage_size = i * PAGE_SIZE;
+ waste = zspage_size % class_size;
+ usedpc = (zspage_size - waste) * 100 / zspage_size;
+
+ if (usedpc > max_usedpc) {
+ max_usedpc = usedpc;
+ max_usedpc_order = i;
+ }
+ }
+
+ return max_usedpc_order;
+}
+
+/*
+ * A single 'zspage' is composed of many system pages which are
+ * linked together using fields in struct page. This function finds
+ * the first/head page, given any component page of a zspage.
+ */
+static struct page *get_first_page(struct page *page)
+{
+ if (is_first_page(page))
+ return page;
+ else
+ return page->first_page;
+}
+
+static struct page *get_next_page(struct page *page)
+{
+ struct page *next;
+
+ if (is_last_page(page))
+ next = NULL;
+ else if (is_first_page(page))
+ next = (struct page *)page_private(page);
+ else
+ next = list_entry(page->lru.next, struct page, lru);
+
+ return next;
+}
+
+/*
+ * Encode <page, obj_idx> as a single handle value.
+ * On hardware platforms with physical memory starting at 0x0 the pfn
+ * could be 0 so we ensure that the handle will never be 0 by adjusting the
+ * encoded obj_idx value before encoding.
+ */
+static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+{
+ unsigned long handle;
+
+ if (!page) {
+ BUG_ON(obj_idx);
+ return NULL;
+ }
+
+ handle = page_to_pfn(page) << OBJ_INDEX_BITS;
+ handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+
+ return (void *)handle;
+}
+
+/*
+ * Decode <page, obj_idx> pair from the given object handle. We adjust the
+ * decoded obj_idx back to its original value since it was adjusted in
+ * obj_location_to_handle().
+ */
+static void obj_handle_to_location(unsigned long handle, struct page **page,
+ unsigned long *obj_idx)
+{
+ *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
+ *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+}
+
+static unsigned long obj_idx_to_offset(struct page *page,
+ unsigned long obj_idx, int class_size)
+{
+ unsigned long off = 0;
+
+ if (!is_first_page(page))
+ off = page->index;
+
+ return off + obj_idx * class_size;
+}
+
+static void reset_page(struct page *page)
+{
+ clear_bit(PG_private, &page->flags);
+ clear_bit(PG_private_2, &page->flags);
+ set_page_private(page, 0);
+ page->mapping = NULL;
+ page->freelist = NULL;
+ page_mapcount_reset(page);
+}
+
+static void free_zspage(struct page *first_page)
+{
+ struct page *nextp, *tmp, *head_extra;
+
+ BUG_ON(!is_first_page(first_page));
+ BUG_ON(first_page->inuse);
+
+ head_extra = (struct page *)page_private(first_page);
+
+ reset_page(first_page);
+ __free_page(first_page);
+
+ /* zspage with only 1 system page */
+ if (!head_extra)
+ return;
+
+ list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
+ list_del(&nextp->lru);
+ reset_page(nextp);
+ __free_page(nextp);
+ }
+ reset_page(head_extra);
+ __free_page(head_extra);
+}
+
+/* Initialize a newly allocated zspage */
+static void init_zspage(struct page *first_page, struct size_class *class)
+{
+ unsigned long off = 0;
+ struct page *page = first_page;
+
+ BUG_ON(!is_first_page(first_page));
+ while (page) {
+ struct page *next_page;
+ struct link_free *link;
+ unsigned int i, objs_on_page;
+
+ /*
+ * page->index stores offset of first object starting
+ * in the page. For the first page, this is always 0,
+ * so we use first_page->index (aka ->freelist) to store
+ * head of corresponding zspage's freelist.
+ */
+ if (page != first_page)
+ page->index = off;
+
+ link = (struct link_free *)kmap_atomic(page) +
+ off / sizeof(*link);
+ objs_on_page = (PAGE_SIZE - off) / class->size;
+
+ for (i = 1; i <= objs_on_page; i++) {
+ off += class->size;
+ if (off < PAGE_SIZE) {
+ link->next = obj_location_to_handle(page, i);
+ link += class->size / sizeof(*link);
+ }
+ }
+
+ /*
+ * We now come to the last (full or partial) object on this
+ * page, which must point to the first object on the next
+ * page (if present)
+ */
+ next_page = get_next_page(page);
+ link->next = obj_location_to_handle(next_page, 0);
+ kunmap_atomic(link);
+ page = next_page;
+ off = (off + class->size) % PAGE_SIZE;
+ }
+}
+
+/*
+ * Allocate a zspage for the given size class
+ */
+static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
+{
+ int i, error;
+ struct page *first_page = NULL, *uninitialized_var(prev_page);
+
+ /*
+ * Allocate individual pages and link them together as:
+ * 1. first page->private = first sub-page
+ * 2. all sub-pages are linked together using page->lru
+ * 3. each sub-page is linked to the first page using page->first_page
+ *
+ * For each size class, First/Head pages are linked together using
+ * page->lru. Also, we set PG_private to identify the first page
+ * (i.e. no other sub-page has this flag set) and PG_private_2 to
+ * identify the last page.
+ */
+ error = -ENOMEM;
+ for (i = 0; i < class->pages_per_zspage; i++) {
+ struct page *page;
+
+ page = alloc_page(flags);
+ if (!page)
+ goto cleanup;
+
+ INIT_LIST_HEAD(&page->lru);
+ if (i == 0) { /* first page */
+ SetPagePrivate(page);
+ set_page_private(page, 0);
+ first_page = page;
+ first_page->inuse = 0;
+ }
+ if (i == 1)
+ set_page_private(first_page, (unsigned long)page);
+ if (i >= 1)
+ page->first_page = first_page;
+ if (i >= 2)
+ list_add(&page->lru, &prev_page->lru);
+ if (i == class->pages_per_zspage - 1) /* last page */
+ SetPagePrivate2(page);
+ prev_page = page;
+ }
+
+ init_zspage(first_page, class);
+
+ first_page->freelist = obj_location_to_handle(first_page, 0);
+ /* Maximum number of objects we can store in this zspage */
+ first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
+
+ error = 0; /* Success */
+
+cleanup:
+ if (unlikely(error) && first_page) {
+ free_zspage(first_page);
+ first_page = NULL;
+ }
+
+ return first_page;
+}
+
+static struct page *find_get_zspage(struct size_class *class)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+ page = class->fullness_list[i];
+ if (page)
+ break;
+ }
+
+ return page;
+}
+
+#ifdef CONFIG_PGTABLE_MAPPING
+static inline int __zs_cpu_up(struct mapping_area *area)
+{
+ /*
+ * Make sure we don't leak memory if a cpu UP notification
+ * and zs_init() race and both call zs_cpu_up() on the same cpu
+ */
+ if (area->vm)
+ return 0;
+ area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
+ if (!area->vm)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void __zs_cpu_down(struct mapping_area *area)
+{
+ if (area->vm)
+ free_vm_area(area->vm);
+ area->vm = NULL;
+}
+
+static inline void *__zs_map_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
+ area->vm_addr = area->vm->addr;
+ return area->vm_addr + off;
+}
+
+static inline void __zs_unmap_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ unsigned long addr = (unsigned long)area->vm_addr;
+
+ unmap_kernel_range(addr, PAGE_SIZE * 2);
+}
+
+#else /* CONFIG_PGTABLE_MAPPING */
+
+static inline int __zs_cpu_up(struct mapping_area *area)
+{
+ /*
+ * Make sure we don't leak memory if a cpu UP notification
+ * and zs_init() race and both call zs_cpu_up() on the same cpu
+ */
+ if (area->vm_buf)
+ return 0;
+ area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+ if (!area->vm_buf)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void __zs_cpu_down(struct mapping_area *area)
+{
+ if (area->vm_buf)
+ free_page((unsigned long)area->vm_buf);
+ area->vm_buf = NULL;
+}
+
+static void *__zs_map_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf = area->vm_buf;
+
+ /* disable page faults to match kmap_atomic() return conditions */
+ pagefault_disable();
+
+ /* no read fastpath */
+ if (area->vm_mm == ZS_MM_WO)
+ goto out;
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy object to per-cpu buffer */
+ addr = kmap_atomic(pages[0]);
+ memcpy(buf, addr + off, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(buf + sizes[0], addr, sizes[1]);
+ kunmap_atomic(addr);
+out:
+ return area->vm_buf;
+}
+
+static void __zs_unmap_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf = area->vm_buf;
+
+ /* no write fastpath */
+ if (area->vm_mm == ZS_MM_RO)
+ goto out;
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy per-cpu buffer to object */
+ addr = kmap_atomic(pages[0]);
+ memcpy(addr + off, buf, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(addr, buf + sizes[0], sizes[1]);
+ kunmap_atomic(addr);
+
+out:
+ /* enable page faults to match kunmap_atomic() return conditions */
+ pagefault_enable();
+}
+
+#endif /* CONFIG_PGTABLE_MAPPING */
+
+static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
+ void *pcpu)
+{
+ int ret, cpu = (long)pcpu;
+ struct mapping_area *area;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ area = &per_cpu(zs_map_area, cpu);
+ ret = __zs_cpu_up(area);
+ if (ret)
+ return notifier_from_errno(ret);
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ area = &per_cpu(zs_map_area, cpu);
+ __zs_cpu_down(area);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block zs_cpu_nb = {
+ .notifier_call = zs_cpu_notifier
+};
+
+static void zs_exit(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
+ unregister_cpu_notifier(&zs_cpu_nb);
+}
+
+static int zs_init(void)
+{
+ int cpu, ret;
+
+ register_cpu_notifier(&zs_cpu_nb);
+ for_each_online_cpu(cpu) {
+ ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ if (notifier_to_errno(ret))
+ goto fail;
+ }
+ return 0;
+fail:
+ zs_exit();
+ return notifier_to_errno(ret);
+}
+
+/**
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @flags: allocation flags used to allocate pool metadata
+ *
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
+ *
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
+ */
+struct zs_pool *zs_create_pool(gfp_t flags)
+{
+ int i, ovhd_size;
+ struct zs_pool *pool;
+
+ ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
+ pool = kzalloc(ovhd_size, GFP_KERNEL);
+ if (!pool)
+ return NULL;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ int size;
+ struct size_class *class;
+
+ size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+ if (size > ZS_MAX_ALLOC_SIZE)
+ size = ZS_MAX_ALLOC_SIZE;
+
+ class = &pool->size_class[i];
+ class->size = size;
+ class->index = i;
+ spin_lock_init(&class->lock);
+ class->pages_per_zspage = get_pages_per_zspage(size);
+
+ }
+
+ pool->flags = flags;
+
+ return pool;
+}
+EXPORT_SYMBOL_GPL(zs_create_pool);
+
+void zs_destroy_pool(struct zs_pool *pool)
+{
+ int i;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ int fg;
+ struct size_class *class = &pool->size_class[i];
+
+ for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+ if (class->fullness_list[fg]) {
+ pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+ class->size, fg);
+ }
+ }
+ }
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
+
+/**
+ * zs_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ *
+ * On success, handle to the allocated object is returned,
+ * otherwise 0.
+ * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
+ */
+unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+{
+ unsigned long obj;
+ struct link_free *link;
+ int class_idx;
+ struct size_class *class;
+
+ struct page *first_page, *m_page;
+ unsigned long m_objidx, m_offset;
+
+ if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
+ return 0;
+
+ class_idx = get_size_class_index(size);
+ class = &pool->size_class[class_idx];
+ BUG_ON(class_idx != class->index);
+
+ spin_lock(&class->lock);
+ first_page = find_get_zspage(class);
+
+ if (!first_page) {
+ spin_unlock(&class->lock);
+ first_page = alloc_zspage(class, pool->flags);
+ if (unlikely(!first_page))
+ return 0;
+
+ set_zspage_mapping(first_page, class->index, ZS_EMPTY);
+ spin_lock(&class->lock);
+ class->pages_allocated += class->pages_per_zspage;
+ }
+
+ obj = (unsigned long)first_page->freelist;
+ obj_handle_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ link = (struct link_free *)kmap_atomic(m_page) +
+ m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ memset(link, POISON_INUSE, sizeof(*link));
+ kunmap_atomic(link);
+
+ first_page->inuse++;
+ /* Now move the zspage to another fullness group, if required */
+ fix_fullness_group(pool, first_page);
+ spin_unlock(&class->lock);
+
+ return obj;
+}
+EXPORT_SYMBOL_GPL(zs_malloc);
+
+void zs_free(struct zs_pool *pool, unsigned long obj)
+{
+ struct link_free *link;
+ struct page *first_page, *f_page;
+ unsigned long f_objidx, f_offset;
+
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!obj))
+ return;
+
+ obj_handle_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = &pool->size_class[class_idx];
+ f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+
+ spin_lock(&class->lock);
+
+ /* Insert this object in containing zspage's freelist */
+ link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
+ + f_offset);
+ link->next = first_page->freelist;
+ kunmap_atomic(link);
+ first_page->freelist = (void *)obj;
+
+ first_page->inuse--;
+ fullness = fix_fullness_group(pool, first_page);
+
+ if (fullness == ZS_EMPTY)
+ class->pages_allocated -= class->pages_per_zspage;
+
+ spin_unlock(&class->lock);
+
+ if (fullness == ZS_EMPTY)
+ free_zspage(first_page);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+/**
+ * zs_map_object - get address of allocated object from handle.
+ * @pool: pool from which the object was allocated
+ * @handle: handle returned from zs_malloc
+ *
+ * Before using an object allocated from zs_malloc, it must be mapped using
+ * this function. When done with the object, it must be unmapped using
+ * zs_unmap_object.
+ *
+ * Only one object can be mapped per cpu at a time. There is no protection
+ * against nested mappings.
+ *
+ * This function returns with preemption and page faults disabled.
+ */
+void *zs_map_object(struct zs_pool *pool, unsigned long handle,
+ enum zs_mapmode mm)
+{
+ struct page *page;
+ unsigned long obj_idx, off;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+ struct page *pages[2];
+
+ BUG_ON(!handle);
+
+ /*
+ * Because we use per-cpu mapping areas shared among the
+ * pools/users, we can't allow mapping in interrupt context
+ * because it can corrupt another users mappings.
+ */
+ BUG_ON(in_interrupt());
+
+ obj_handle_to_location(handle, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = &pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ area = &get_cpu_var(zs_map_area);
+ area->vm_mm = mm;
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ area->vm_addr = kmap_atomic(page);
+ return area->vm_addr + off;
+ }
+
+ /* this object spans two pages */
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ return __zs_map_object(area, pages, off, class->size);
+}
+EXPORT_SYMBOL_GPL(zs_map_object);
+
+void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *page;
+ unsigned long obj_idx, off;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+
+ BUG_ON(!handle);
+
+ obj_handle_to_location(handle, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = &pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ area = &__get_cpu_var(zs_map_area);
+ if (off + class->size <= PAGE_SIZE)
+ kunmap_atomic(area->vm_addr);
+ else {
+ struct page *pages[2];
+
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ __zs_unmap_object(area, pages, off, class->size);
+ }
+ put_cpu_var(zs_map_area);
+}
+EXPORT_SYMBOL_GPL(zs_unmap_object);
+
+u64 zs_get_total_size_bytes(struct zs_pool *pool)
+{
+ int i;
+ u64 npages = 0;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++)
+ npages += pool->size_class[i].pages_allocated;
+
+ return npages << PAGE_SHIFT;
+}
+EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
+
+module_init(zs_init);
+module_exit(zs_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/mm/zswap.c b/mm/zswap.c
index 5a63f78a5601..e55bab9dc41f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -77,12 +77,12 @@ static u64 zswap_duplicate_entry;
**********************************/
/* Enable/disable zswap (disabled by default, fixed at boot for now) */
static bool zswap_enabled __read_mostly;
-module_param_named(enabled, zswap_enabled, bool, 0);
+module_param_named(enabled, zswap_enabled, bool, 0444);
/* Compressor to be used by zswap (fixed at boot for now) */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-module_param_named(compressor, zswap_compressor, charp, 0);
+module_param_named(compressor, zswap_compressor, charp, 0444);
/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;