summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/bootmem.c159
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c7
-rw-r--r--mm/init-mm.c11
-rw-r--r--mm/memblock.c203
-rw-r--r--mm/memcontrol.c28
-rw-r--r--mm/memfd.c2
-rw-r--r--mm/memory.c79
-rw-r--r--mm/mprotect.c49
-rw-r--r--mm/nobootmem.c20
-rw-r--r--mm/page_alloc.c27
-rw-r--r--mm/page_io.c3
-rw-r--r--mm/readahead.c19
-rw-r--r--mm/shmem.c59
-rw-r--r--mm/swapfile.c77
-rw-r--r--mm/usercopy.c25
17 files changed, 510 insertions, 269 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ce95491abd6a..9ae1b6a8e30f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,3 +1,6 @@
+
+menu "Memory Management options"
+
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
@@ -762,3 +765,5 @@ config GUP_BENCHMARK
config ARCH_HAS_PTE_SPECIAL
bool
+
+endmenu
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9e197987b67d..97db0e8e362b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -21,6 +21,53 @@
#include "internal.h"
+/**
+ * DOC: bootmem overview
+ *
+ * Bootmem is a boot-time physical memory allocator and configurator.
+ *
+ * It is used early in the boot process before the page allocator is
+ * set up.
+ *
+ * Bootmem is based on the most basic of allocators, a First Fit
+ * allocator which uses a bitmap to represent memory. If a bit is 1,
+ * the page is allocated and 0 if unallocated. To satisfy allocations
+ * of sizes smaller than a page, the allocator records the Page Frame
+ * Number (PFN) of the last allocation and the offset the allocation
+ * ended at. Subsequent small allocations are merged together and
+ * stored on the same page.
+ *
+ * The information used by the bootmem allocator is represented by
+ * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES
+ * such structures is statically allocated and then it is discarded
+ * when the system initialization completes. Each entry in this array
+ * corresponds to a node with memory. For UMA systems only entry 0 is
+ * used.
+ *
+ * The bootmem allocator is initialized during early architecture
+ * specific setup. Each architecture is required to supply a
+ * :c:func:`setup_arch` function which, among other tasks, is
+ * responsible for acquiring the necessary parameters to initialise
+ * the boot memory allocator. These parameters define limits of usable
+ * physical memory:
+ *
+ * * @min_low_pfn - the lowest PFN that is available in the system
+ * * @max_low_pfn - the highest PFN that may be addressed by low
+ * memory (%ZONE_NORMAL)
+ * * @max_pfn - the last PFN available to the system.
+ *
+ * After those limits are determined, the :c:func:`init_bootmem` or
+ * :c:func:`init_bootmem_node` function should be called to initialize
+ * the bootmem allocator. The UMA case should use the `init_bootmem`
+ * function. It will initialize ``contig_page_data`` structure that
+ * represents the only memory node in the system. In the NUMA case the
+ * `init_bootmem_node` function should be called to initialize the
+ * bootmem allocator for each node.
+ *
+ * Once the allocator is set up, it is possible to use either single
+ * node or NUMA variant of the allocation APIs.
+ */
+
#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data = {
.bdata = &bootmem_node_data[0]
@@ -62,6 +109,8 @@ static unsigned long __init bootmap_bytes(unsigned long pages)
/**
* bootmem_bootmap_pages - calculate bitmap size in pages
* @pages: number of pages the bitmap has to represent
+ *
+ * Return: the number of pages needed to hold the bitmap.
*/
unsigned long __init bootmem_bootmap_pages(unsigned long pages)
{
@@ -121,7 +170,7 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
* @startpfn: first pfn on the node
* @endpfn: first pfn after the node
*
- * Returns the number of bytes needed to hold the bitmap for this node.
+ * Return: the number of bytes needed to hold the bitmap for this node.
*/
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
unsigned long startpfn, unsigned long endpfn)
@@ -134,7 +183,7 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
* @start: pfn where the bitmap is to be placed
* @pages: number of available physical pages
*
- * Returns the number of bytes needed to hold the bitmap.
+ * Return: the number of bytes needed to hold the bitmap.
*/
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{
@@ -143,15 +192,6 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
-/*
- * free_bootmem_late - free bootmem pages directly to page allocator
- * @addr: starting physical address of the range
- * @size: size of the range in bytes
- *
- * This is only useful when the bootmem allocator has already been torn
- * down, but we are still initializing the system. Pages are given directly
- * to the page allocator, no bootmem metadata is updated because it is gone.
- */
void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
{
unsigned long cursor, end;
@@ -264,11 +304,6 @@ void __init reset_all_zones_managed_pages(void)
reset_managed_pages_done = 1;
}
-/**
- * free_all_bootmem - release free pages to the buddy allocator
- *
- * Returns the number of pages actually released.
- */
unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
@@ -385,16 +420,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
BUG();
}
-/**
- * free_bootmem_node - mark a page range as usable
- * @pgdat: node the range resides on
- * @physaddr: starting address of the range
- * @size: size of the range in bytes
- *
- * Partial pages will be considered reserved and left as they are.
- *
- * The range must reside completely on the specified node.
- */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
@@ -408,15 +433,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}
-/**
- * free_bootmem - mark a page range as usable
- * @physaddr: starting physical address of the range
- * @size: size of the range in bytes
- *
- * Partial pages will be considered reserved and left as they are.
- *
- * The range must be contiguous but may span node boundaries.
- */
void __init free_bootmem(unsigned long physaddr, unsigned long size)
{
unsigned long start, end;
@@ -439,6 +455,8 @@ void __init free_bootmem(unsigned long physaddr, unsigned long size)
* Partial pages will be reserved.
*
* The range must reside completely on the specified node.
+ *
+ * Return: 0 on success, -errno on failure.
*/
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
@@ -460,6 +478,8 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
* Partial pages will be reserved.
*
* The range must be contiguous but may span node boundaries.
+ *
+ * Return: 0 on success, -errno on failure.
*/
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
@@ -646,19 +666,6 @@ restart:
return NULL;
}
-/**
- * __alloc_bootmem_nopanic - allocate boot memory without panicking
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * Returns NULL on failure.
- */
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
{
@@ -682,19 +689,6 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
return NULL;
}
-/**
- * __alloc_bootmem - allocate boot memory
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
{
@@ -754,21 +748,6 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
return NULL;
}
-/**
- * __alloc_bootmem_node - allocate boot memory from a specific node
- * @pgdat: node to allocate from
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may fall back to any node in the system if the specified node
- * can not hold the requested memory.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
@@ -807,19 +786,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
}
-/**
- * __alloc_bootmem_low - allocate low boot memory
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
{
@@ -834,21 +800,6 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size,
ARCH_LOW_ADDRESS_LIMIT);
}
-/**
- * __alloc_bootmem_low_node - allocate low boot memory from a specific node
- * @pgdat: node to allocate from
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may fall back to any node in the system if the specified node
- * can not hold the requested memory.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 25346bd99364..a9e1e093df51 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_try_charge(pages[i], vma->vm_mm,
+ mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
if (pages[i])
put_page(pages[i]);
@@ -1312,7 +1312,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+ if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 039ddbc574e9..3103099f64fd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3167,6 +3167,13 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
return 0;
}
+/*
+ * When a new function is introduced to vm_operations_struct and added
+ * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
+ * This is because under System V memory model, mappings created via
+ * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
+ * their original vm_ops are overwritten with shm_vm_ops.
+ */
const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
#define INIT_MM_CONTEXT(name)
#endif
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
+ .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/memblock.c b/mm/memblock.c
index 4b5d245fafc1..b4ad05764745 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -27,6 +27,61 @@
#include "internal.h"
+/**
+ * DOC: memblock overview
+ *
+ * Memblock is a method of managing memory regions during the early
+ * boot period when the usual kernel memory allocators are not up and
+ * running.
+ *
+ * Memblock views the system memory as collections of contiguous
+ * regions. There are several types of these collections:
+ *
+ * * ``memory`` - describes the physical memory available to the
+ * kernel; this may differ from the actual physical memory installed
+ * in the system, for instance when the memory is restricted with
+ * ``mem=`` command line parameter
+ * * ``reserved`` - describes the regions that were allocated
+ * * ``physmap`` - describes the actual physical memory regardless of
+ * the possible restrictions; the ``physmap`` type is only available
+ * on some architectures.
+ *
+ * Each region is represented by :c:type:`struct memblock_region` that
+ * defines the region extents, its attributes and NUMA node id on NUMA
+ * systems. Every memory type is described by the :c:type:`struct
+ * memblock_type` which contains an array of memory regions along with
+ * the allocator metadata. The memory types are nicely wrapped with
+ * :c:type:`struct memblock`. This structure is statically initialzed
+ * at build time. The region arrays for the "memory" and "reserved"
+ * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
+ * "physmap" type to %INIT_PHYSMEM_REGIONS.
+ * The :c:func:`memblock_allow_resize` enables automatic resizing of
+ * the region arrays during addition of new regions. This feature
+ * should be used with care so that memory allocated for the region
+ * array will not overlap with areas that should be reserved, for
+ * example initrd.
+ *
+ * The early architecture setup should tell memblock what the physical
+ * memory layout is by using :c:func:`memblock_add` or
+ * :c:func:`memblock_add_node` functions. The first function does not
+ * assign the region to a NUMA node and it is appropriate for UMA
+ * systems. Yet, it is possible to use it on NUMA systems as well and
+ * assign the region to a NUMA node later in the setup process using
+ * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node`
+ * performs such an assignment directly.
+ *
+ * Once memblock is setup the memory can be allocated using either
+ * memblock or bootmem APIs.
+ *
+ * As the system boot progresses, the architecture specific
+ * :c:func:`mem_init` function frees all the memory to the buddy page
+ * allocator.
+ *
+ * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the
+ * memblock data structures will be discarded after the system
+ * initialization compltes.
+ */
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -61,7 +116,7 @@ static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
-ulong __init_memblock choose_memblock_flags(void)
+enum memblock_flags __init_memblock choose_memblock_flags(void)
{
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -93,10 +148,11 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
return i < type->cnt;
}
-/*
+/**
* __memblock_find_range_bottom_up - find free area utility in bottom-up
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -104,13 +160,13 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid,
- ulong flags)
+ enum memblock_flags flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
@@ -130,7 +186,8 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
/**
* __memblock_find_range_top_down - find free area utility, in top-down
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -138,13 +195,13 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid,
- ulong flags)
+ enum memblock_flags flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
@@ -170,7 +227,8 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* @size: size of free area to find
* @align: alignment of free area to find
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
@@ -184,12 +242,13 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*
* If bottom-up allocation failed, will try to allocate memory top-down.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid, ulong flags)
+ phys_addr_t end, int nid,
+ enum memblock_flags flags)
{
phys_addr_t kernel_end, ret;
@@ -239,13 +298,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
/**
* memblock_find_in_range - find free area in given range
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @size: size of free area to find
* @align: alignment of free area to find
*
* Find @size free area aligned to @align in the specified range.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
@@ -253,7 +313,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t align)
{
phys_addr_t ret;
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
again:
ret = memblock_find_in_range_node(size, align, start, end,
@@ -289,7 +349,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
/**
- * Discard memory and reserved arrays if they were allocated
+ * memblock_discard - discard memory and reserved arrays if they were allocated
*/
void __init memblock_discard(void)
{
@@ -319,11 +379,11 @@ void __init memblock_discard(void)
*
* Double the size of the @type regions array. If memblock is being used to
* allocate memory for a new reserved regions array and there is a previously
- * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * allocated memory range [@new_area_start, @new_area_start + @new_area_size]
* waiting to be reserved, ensure the memory used by the new array does
* not overlap.
*
- * RETURNS:
+ * Return:
* 0 on success, -1 on failure.
*/
static int __init_memblock memblock_double_array(struct memblock_type *type,
@@ -468,13 +528,14 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
* @nid: node id of the new region
* @flags: flags of the new region
*
- * Insert new memblock region [@base,@base+@size) into @type at @idx.
+ * Insert new memblock region [@base, @base + @size) into @type at @idx.
* @type must already have extra room to accommodate the new region.
*/
static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
phys_addr_t size,
- int nid, unsigned long flags)
+ int nid,
+ enum memblock_flags flags)
{
struct memblock_region *rgn = &type->regions[idx];
@@ -496,17 +557,17 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* @nid: nid of the new region
* @flags: flags of the new region
*
- * Add new memblock region [@base,@base+@size) into @type. The new region
+ * Add new memblock region [@base, @base + @size) into @type. The new region
* is allowed to overlap with existing ones - overlaps don't affect already
* existing regions. @type is guaranteed to be minimal (all neighbouring
* compatible regions are merged) after the addition.
*
- * RETURNS:
+ * Return:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
- int nid, unsigned long flags)
+ int nid, enum memblock_flags flags)
{
bool insert = false;
phys_addr_t obase = base;
@@ -590,12 +651,35 @@ repeat:
}
}
+/**
+ * memblock_add_node - add new memblock region within a NUMA node
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: nid of the new region
+ *
+ * Add new memblock region [@base, @base + @size) to the "memory"
+ * type. See memblock_add_range() description for mode details
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
+/**
+ * memblock_add - add new memblock region
+ * @base: base address of the new region
+ * @size: size of the new region
+ *
+ * Add new memblock region [@base, @base + @size) to the "memory"
+ * type. See memblock_add_range() description for mode details
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
@@ -615,11 +699,11 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
* @end_rgn: out parameter for the end of isolated region
*
* Walk @type and ensure that regions don't cross the boundaries defined by
- * [@base,@base+@size). Crossing regions are split at the boundaries,
+ * [@base, @base + @size). Crossing regions are split at the boundaries,
* which may create at most two more regions. The index of the first
* region inside the range is returned in *@start_rgn and end in *@end_rgn.
*
- * RETURNS:
+ * Return:
* 0 on success, -errno on failure.
*/
static int __init_memblock memblock_isolate_range(struct memblock_type *type,
@@ -730,10 +814,15 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
}
/**
+ * memblock_setclr_flag - set or clear flag for a memory region
+ * @base: base address of the region
+ * @size: size of the region
+ * @set: set or clear the flag
+ * @flag: the flag to udpate
*
* This function isolates region [@base, @base + @size), and sets/clears flag
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
phys_addr_t size, int set, int flag)
@@ -760,7 +849,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -772,7 +861,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -784,7 +873,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
{
@@ -798,7 +887,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
{
@@ -810,7 +899,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
{
@@ -875,7 +964,8 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
+void __init_memblock __next_mem_range(u64 *idx, int nid,
+ enum memblock_flags flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -970,9 +1060,6 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
/**
* __next_mem_range_rev - generic next function for for_each_*_range_rev()
*
- * Finds the next range from type_a which is not marked as unsuitable
- * in type_b.
- *
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
@@ -982,9 +1069,13 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
+ * Finds the next range from type_a which is not marked as unsuitable
+ * in type_b.
+ *
* Reverse of __next_mem_range().
*/
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+ enum memblock_flags flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -1116,10 +1207,10 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
* @type: memblock type to set node ID for
* @nid: node ID to set
*
- * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock @type regions in [@base, @base + @size) to @nid.
* Regions which cross the area boundaries are split as necessary.
*
- * RETURNS:
+ * Return:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
@@ -1142,7 +1233,8 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid, ulong flags)
+ phys_addr_t end, int nid,
+ enum memblock_flags flags)
{
phys_addr_t found;
@@ -1164,7 +1256,7 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
- ulong flags)
+ enum memblock_flags flags)
{
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
flags);
@@ -1172,14 +1264,14 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
- int nid, ulong flags)
+ int nid, enum memblock_flags flags)
{
return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
phys_addr_t ret;
again:
@@ -1243,7 +1335,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* The allocation is performed from memory region limited by
* memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
*
- * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
+ * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0.
*
* The phys address of allocated boot memory block is converted to virtual and
* allocated memory is reset to 0.
@@ -1251,7 +1343,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* In addition, function sets the min_count to 0 using kmemleak_alloc for
* allocated boot memory block, so that it is never reported as leaks.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
static void * __init memblock_virt_alloc_internal(
@@ -1261,7 +1353,7 @@ static void * __init memblock_virt_alloc_internal(
{
phys_addr_t alloc;
void *ptr;
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;
@@ -1336,7 +1428,7 @@ done:
* info), if enabled. Does not zero allocated memory, does not panic if request
* cannot be satisfied.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
void * __init memblock_virt_alloc_try_nid_raw(
@@ -1373,7 +1465,7 @@ void * __init memblock_virt_alloc_try_nid_raw(
* Public function, provides additional debug information (including caller
* info), if enabled. This function zeroes the allocated memory.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
void * __init memblock_virt_alloc_try_nid_nopanic(
@@ -1409,7 +1501,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
* which provides debug information (including caller info), if enabled,
* and panics if the request can not be satisfied.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
void * __init memblock_virt_alloc_try_nid(
@@ -1453,9 +1545,9 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
memblock_remove_range(&memblock.reserved, base, size);
}
-/*
+/**
* __memblock_free_late - free bootmem block pages directly to buddy allocator
- * @addr: phys starting address of the boot memory block
+ * @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
* This is only useful when the bootmem allocator has already been torn
@@ -1667,9 +1759,9 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
* @base: base of region to check
* @size: size of region to check
*
- * Check if the region [@base, @base+@size) is a subset of a memory block.
+ * Check if the region [@base, @base + @size) is a subset of a memory block.
*
- * RETURNS:
+ * Return:
* 0 if false, non-zero if true
*/
bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
@@ -1688,9 +1780,10 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz
* @base: base of region to check
* @size: size of region to check
*
- * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ * Check if the region [@base, @base + @size) intersects a reserved
+ * memory block.
*
- * RETURNS:
+ * Return:
* True if they intersect, false if not.
*/
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
@@ -1737,7 +1830,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void)
static void __init_memblock memblock_dump(struct memblock_type *type)
{
phys_addr_t base, end, size;
- unsigned long flags;
+ enum memblock_flags flags;
int idx;
struct memblock_region *rgn;
@@ -1755,7 +1848,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n",
+ pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n",
type->name, idx, &base, &end, &size, nid_buf, flags);
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b3143e..b836e7f00309 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4037,6 +4037,14 @@ static struct cftype mem_cgroup_legacy_files[] = {
static DEFINE_IDR(mem_cgroup_idr);
+static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+{
+ if (memcg->id.id > 0) {
+ idr_remove(&mem_cgroup_idr, memcg->id.id);
+ memcg->id.id = 0;
+ }
+}
+
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
@@ -4047,8 +4055,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
if (atomic_sub_and_test(n, &memcg->id.ref)) {
- idr_remove(&mem_cgroup_idr, memcg->id.id);
- memcg->id.id = 0;
+ mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
css_put(&memcg->css);
@@ -4185,8 +4192,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
fail:
- if (memcg->id.id > 0)
- idr_remove(&mem_cgroup_idr, memcg->id.id);
+ mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
return NULL;
}
@@ -4245,6 +4251,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
fail:
+ mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
return ERR_PTR(-ENOMEM);
}
@@ -5593,6 +5600,19 @@ out:
return ret;
}
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+ memcg = *memcgp;
+ mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+ return ret;
+}
+
/**
* mem_cgroup_commit_charge - commit a page charge
* @page: page to charge
diff --git a/mm/memfd.c b/mm/memfd.c
index 27069518e3c5..2bb5e257080e 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -326,7 +326,7 @@ SYSCALL_DEFINE2(memfd_create,
goto err_fd;
}
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
- file->f_flags |= O_RDWR | O_LARGEFILE;
+ file->f_flags |= O_LARGEFILE;
if (flags & MFD_ALLOW_SEALING) {
file_seals = memfd_file_seals_ptr(file);
diff --git a/mm/memory.c b/mm/memory.c
index 7206a634270b..348279ff6e51 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-/*
- * See the comment near struct mmu_table_batch.
- */
-
static void tlb_remove_table_smp_sync(void *arg)
{
- /* Simply deliver the interrupt */
+ struct mm_struct __maybe_unused *mm = arg;
+ /*
+ * On most architectures this does nothing. Simply delivering the
+ * interrupt is enough to prevent races with software page table
+ * walking like that done in get_user_pages_fast.
+ *
+ * See the comment near struct mmu_table_batch.
+ */
+ tlb_flush_remove_tables_local(mm);
}
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
@@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table)
* It is however sufficient for software page-table walkers that rely on
* IRQ disabling. See the comment near struct mmu_table_batch.
*/
- smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
__tlb_remove_table(table);
}
@@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
{
struct mmu_table_batch **batch = &tlb->batch;
+ tlb_flush_remove_tables(tlb->mm);
+
if (*batch) {
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
*batch = NULL;
@@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
- tlb_remove_table_one(table);
+ tlb_remove_table_one(table, tlb);
return;
}
(*batch)->nr = 0;
@@ -1417,11 +1423,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
- !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
- } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+ else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
}
@@ -1886,6 +1890,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
+ if (!pfn_modify_allowed(pfn, pgprot))
+ return -EACCES;
+
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@ -1921,6 +1928,9 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
+ if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ return -EACCES;
+
/*
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1982,6 +1992,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
spinlock_t *ptl;
+ int err = 0;
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
@@ -1989,12 +2000,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
+ if (!pfn_modify_allowed(pfn, prot)) {
+ err = -EACCES;
+ break;
+ }
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
- return 0;
+ return err;
}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -2003,6 +2018,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pmd = pmd_alloc(mm, pud, addr);
@@ -2011,9 +2027,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
- if (remap_pte_range(mm, pmd, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pte_range(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pmd++, addr = next, addr != end);
return 0;
}
@@ -2024,6 +2041,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
{
pud_t *pud;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pud = pud_alloc(mm, p4d, addr);
@@ -2031,9 +2049,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (remap_pmd_range(mm, pud, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pmd_range(mm, pud, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pud++, addr = next, addr != end);
return 0;
}
@@ -2044,6 +2063,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
{
p4d_t *p4d;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
p4d = p4d_alloc(mm, pgd, addr);
@@ -2051,9 +2071,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (remap_pud_range(mm, p4d, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pud_range(mm, p4d, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (p4d++, addr = next, addr != end);
return 0;
}
@@ -2503,7 +2524,7 @@ static int wp_page_copy(struct vm_fault *vmf)
cow_user_page(new_page, old_page, vmf->address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
@@ -3003,8 +3024,8 @@ int do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -3165,7 +3186,8 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+ false))
goto oom_free_page;
/*
@@ -3661,7 +3683,7 @@ static int do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
&vmf->memcg, false)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
@@ -4397,6 +4419,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+ if (!maddr)
+ return -ENOMEM;
+
if (write)
memcpy_toio(maddr + offset, buf, len);
else
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 625608bc8962..6d331620b9e5 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -306,6 +306,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
return pages;
}
+static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_test(unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return 0;
+}
+
+static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ pgprot_t new_pgprot = vm_get_page_prot(newflags);
+ struct mm_walk prot_none_walk = {
+ .pte_entry = prot_none_pte_entry,
+ .hugetlb_entry = prot_none_hugetlb_entry,
+ .test_walk = prot_none_test,
+ .mm = current->mm,
+ .private = &new_pgprot,
+ };
+
+ return walk_page_range(start, end, &prot_none_walk);
+}
+
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
@@ -324,6 +360,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
}
/*
+ * Do PROT_NONE PFN permission checks here when we can still
+ * bail out without undoing a lot of state. This is a rather
+ * uncommon case, so doesn't need to be very optimized.
+ */
+ if (arch_has_pfn_modify_check() &&
+ (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
+ error = prot_none_walk(vma, start, end, newflags);
+ if (error)
+ return error;
+ }
+
+ /*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
* make it unwritable again. hugetlb mapping were accounted for
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 9b02fda0886b..439af3b765a7 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -42,7 +42,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
{
void *ptr;
u64 addr;
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
if (limit > memblock.current_limit)
limit = memblock.current_limit;
@@ -72,7 +72,7 @@ again:
return ptr;
}
-/*
+/**
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
* @size: size of the range in bytes
@@ -176,7 +176,7 @@ void __init reset_all_zones_managed_pages(void)
/**
* free_all_bootmem - release free pages to the buddy allocator
*
- * Returns the number of pages actually released.
+ * Return: the number of pages actually released.
*/
unsigned long __init free_all_bootmem(void)
{
@@ -193,7 +193,7 @@ unsigned long __init free_all_bootmem(void)
/**
* free_bootmem_node - mark a page range as usable
* @pgdat: node the range resides on
- * @physaddr: starting address of the range
+ * @physaddr: starting physical address of the range
* @size: size of the range in bytes
*
* Partial pages will be considered reserved and left as they are.
@@ -208,7 +208,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
/**
* free_bootmem - mark a page range as usable
- * @addr: starting address of the range
+ * @addr: starting physical address of the range
* @size: size of the range in bytes
*
* Partial pages will be considered reserved and left as they are.
@@ -256,7 +256,7 @@ restart:
*
* Allocation may happen on any node in the system.
*
- * Returns NULL on failure.
+ * Return: address of the allocated region or %NULL on failure.
*/
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
@@ -293,6 +293,8 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
* Allocation may happen on any node in the system.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
@@ -367,6 +369,8 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
* can not hold the requested memory.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
@@ -396,6 +400,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
* Allocation may happen on any node in the system.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
@@ -425,6 +431,8 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size,
* can not hold the requested memory.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a790ef4be74e..0922ef5d2e46 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -155,16 +155,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
* The following functions are used by the suspend/hibernate code to temporarily
* change gfp_allowed_mask in order to avoid using I/O during memory allocations
* while devices are suspended. To avoid races with the suspend/hibernate code,
- * they should always be called with pm_mutex held (gfp_allowed_mask also should
- * only be modified with pm_mutex held, unless the suspend/hibernate code is
- * guaranteed not to run in parallel with that modification).
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
*/
static gfp_t saved_gfp_mask;
void pm_restore_gfp_mask(void)
{
- WARN_ON(!mutex_is_locked(&pm_mutex));
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
if (saved_gfp_mask) {
gfp_allowed_mask = saved_gfp_mask;
saved_gfp_mask = 0;
@@ -173,7 +174,7 @@ void pm_restore_gfp_mask(void)
void pm_restrict_gfp_mask(void)
{
- WARN_ON(!mutex_is_locked(&pm_mutex));
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
@@ -6939,9 +6940,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
start = (void *)PAGE_ALIGN((unsigned long)start);
end = (void *)((unsigned long)end & PAGE_MASK);
for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+ struct page *page = virt_to_page(pos);
+ void *direct_map_addr;
+
+ /*
+ * 'direct_map_addr' might be different from 'pos'
+ * because some architectures' virt_to_page()
+ * work with aliases. Getting the direct map
+ * address ensures that we get a _writeable_
+ * alias for the memset().
+ */
+ direct_map_addr = page_address(page);
if ((unsigned int)poison <= 0xFF)
- memset(pos, poison, PAGE_SIZE);
- free_reserved_page(virt_to_page(pos));
+ memset(direct_map_addr, poison, PAGE_SIZE);
+
+ free_reserved_page(page);
}
if (pages && s)
diff --git a/mm/page_io.c b/mm/page_io.c
index b41cf9644585..aafd19ec1db4 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = -ENOMEM;
goto out;
}
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+ bio_associate_blkcg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/readahead.c b/mm/readahead.c
index e273f0de3376..a59ea70527b9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
+#include <linux/blk-cgroup.h>
#include "internal.h"
@@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
+ unsigned long add_pages;
pgoff_t prev_offset;
/*
@@ -474,10 +476,17 @@ readit:
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
+ * Take care of maximum IO pages as above.
*/
if (offset == ra->start && ra->size == ra->async_size) {
- ra->async_size = get_next_ra_size(ra, max_pages);
- ra->size += ra->async_size;
+ add_pages = get_next_ra_size(ra, max_pages);
+ if (ra->size + add_pages <= max_pages) {
+ ra->async_size = add_pages;
+ ra->size += add_pages;
+ } else {
+ ra->size = max_pages;
+ ra->async_size = max_pages >> 1;
+ }
}
return ra_submit(ra, mapping, filp);
@@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping,
if (!ra->ra_pages)
return;
+ if (blk_cgroup_congested())
+ return;
+
/* be dumb */
if (filp && (filp->f_mode & FMODE_RANDOM)) {
force_page_cache_readahead(mapping, filp, offset, req_size);
@@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping,
if (inode_read_congested(mapping->host))
return;
+ if (blk_cgroup_congested())
+ return;
+
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 41b9bbf24e16..06ebe17bb924 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
- false);
+ error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+ &memcg, false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1713,7 +1713,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1819,7 +1819,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
PageTransHuge(page));
if (error)
goto unacct;
@@ -2292,7 +2292,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
__SetPageSwapBacked(page);
__SetPageUptodate(page);
- ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+ ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
if (ret)
goto out_release;
@@ -3897,18 +3897,11 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
/* common code */
-static const struct dentry_operations anon_ops = {
- .d_dname = simple_dname
-};
-
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
unsigned long flags, unsigned int i_flags)
{
- struct file *res;
struct inode *inode;
- struct path path;
- struct super_block *sb;
- struct qstr this;
+ struct file *res;
if (IS_ERR(mnt))
return ERR_CAST(mnt);
@@ -3919,41 +3912,21 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- res = ERR_PTR(-ENOMEM);
- this.name = name;
- this.len = strlen(name);
- this.hash = 0; /* will go */
- sb = mnt->mnt_sb;
- path.mnt = mntget(mnt);
- path.dentry = d_alloc_pseudo(sb, &this);
- if (!path.dentry)
- goto put_memory;
- d_set_d_op(path.dentry, &anon_ops);
-
- res = ERR_PTR(-ENOSPC);
- inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags);
- if (!inode)
- goto put_memory;
-
+ inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
+ flags);
+ if (unlikely(!inode)) {
+ shmem_unacct_size(flags, size);
+ return ERR_PTR(-ENOSPC);
+ }
inode->i_flags |= i_flags;
- d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
+ if (!IS_ERR(res))
+ res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
+ &shmem_file_operations);
if (IS_ERR(res))
- goto put_path;
-
- res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
- &shmem_file_operations);
- if (IS_ERR(res))
- goto put_path;
-
- return res;
-
-put_memory:
- shmem_unacct_size(flags, size);
-put_path:
- path_put(&path);
+ iput(inode);
return res;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2cc2972eedaf..8837b22c848d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2909,6 +2909,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
return 0;
}
+
+/*
+ * Find out how many pages are allowed for a single swap device. There
+ * are two limiting factors:
+ * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte, as defined by the different
+ * architectures.
+ *
+ * In order to find the largest possible bit mask, a swap entry with
+ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap offset is
+ * extracted.
+ *
+ * This will mask all the bits from the initial ~0UL mask that can't
+ * be encoded in either the swp_entry_t or the architecture definition
+ * of a swap pte.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+ return swp_offset(pte_to_swp_entry(
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+}
+
+/* Can be overridden by an architecture for additional checks. */
+__weak unsigned long max_swapfile_size(void)
+{
+ return generic_max_swapfile_size();
+}
+
static unsigned long read_swap_header(struct swap_info_struct *p,
union swap_header *swap_header,
struct inode *inode)
@@ -2944,22 +2973,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
p->cluster_next = 1;
p->cluster_nr = 0;
- /*
- * Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number
- * of bits for the swap offset in the swp_entry_t type, and
- * 2) the number of bits in the swap pte as defined by the
- * different architectures. In order to find the
- * largest possible bit mask, a swap entry with swap type 0
- * and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again, and finally the swap
- * offset is extracted. This will mask all the bits from
- * the initial ~0UL mask that can't be encoded in either
- * the swp_entry_t or the architecture definition of a
- * swap pte.
- */
- maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ maxpages = max_swapfile_size();
last_page = swap_header->info.last_page;
if (!last_page) {
pr_warn("Empty swap-file\n");
@@ -3731,6 +3745,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
}
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+ gfp_t gfp_mask)
+{
+ struct swap_info_struct *si, *next;
+ if (!(gfp_mask & __GFP_IO) || !memcg)
+ return;
+
+ if (!blk_cgroup_congested())
+ return;
+
+ /*
+ * We've already scheduled a throttle, avoid taking the global swap
+ * lock.
+ */
+ if (current->throttle_queue)
+ return;
+
+ spin_lock(&swap_avail_lock);
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+ avail_lists[node]) {
+ if (si->bdev) {
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+ true);
+ break;
+ }
+ }
+ spin_unlock(&swap_avail_lock);
+}
+#endif
+
static int __init swapfile_init(void)
{
int nid;
diff --git a/mm/usercopy.c b/mm/usercopy.c
index e9e9325f7638..852eb4e53f06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -20,6 +20,8 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
#include <asm/sections.h>
/*
@@ -240,6 +242,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
+static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+
/*
* Validates that the given object is:
* - not bogus address
@@ -248,6 +252,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
+ if (static_branch_unlikely(&bypass_usercopy_checks))
+ return;
+
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -279,3 +286,21 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);
+
+static bool enable_checks __initdata = true;
+
+static int __init parse_hardened_usercopy(char *str)
+{
+ return strtobool(str, &enable_checks);
+}
+
+__setup("hardened_usercopy=", parse_hardened_usercopy);
+
+static int __init set_hardened_usercopy(void)
+{
+ if (enable_checks == false)
+ static_branch_enable(&bypass_usercopy_checks);
+ return 1;
+}
+
+late_initcall(set_hardened_usercopy);