Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig       |   5
-rw-r--r--  mm/bootmem.c     | 159
-rw-r--r--  mm/huge_memory.c |   6
-rw-r--r--  mm/init-mm.c     |  11
-rw-r--r--  mm/memblock.c    | 203
-rw-r--r--  mm/memcontrol.c  |  13
-rw-r--r--  mm/memfd.c       |   2
-rw-r--r--  mm/memory.c      |  70
-rw-r--r--  mm/mprotect.c    |  49
-rw-r--r--  mm/nobootmem.c   |  20
-rw-r--r--  mm/page_alloc.c  |  27
-rw-r--r--  mm/page_io.c     |   3
-rw-r--r--  mm/readahead.c   |  19
-rw-r--r--  mm/shmem.c       |  59
-rw-r--r--  mm/swapfile.c    |  77
-rw-r--r--  mm/usercopy.c    |  25
16 files changed, 487 insertions(+), 261 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index ce95491abd6a..9ae1b6a8e30f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1,3 +1,6 @@ + +menu "Memory Management options" + config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL @@ -762,3 +765,5 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool + +endmenu diff --git a/mm/bootmem.c b/mm/bootmem.c index 9e197987b67d..97db0e8e362b 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -21,6 +21,53 @@ #include "internal.h" +/** + * DOC: bootmem overview + * + * Bootmem is a boot-time physical memory allocator and configurator. + * + * It is used early in the boot process before the page allocator is + * set up. + * + * Bootmem is based on the most basic of allocators, a First Fit + * allocator which uses a bitmap to represent memory. If a bit is 1, + * the page is allocated and 0 if unallocated. To satisfy allocations + * of sizes smaller than a page, the allocator records the Page Frame + * Number (PFN) of the last allocation and the offset the allocation + * ended at. Subsequent small allocations are merged together and + * stored on the same page. + * + * The information used by the bootmem allocator is represented by + * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES + * such structures is statically allocated and then it is discarded + * when the system initialization completes. Each entry in this array + * corresponds to a node with memory. For UMA systems only entry 0 is + * used. + * + * The bootmem allocator is initialized during early architecture + * specific setup. Each architecture is required to supply a + * :c:func:`setup_arch` function which, among other tasks, is + * responsible for acquiring the necessary parameters to initialise + * the boot memory allocator. These parameters define limits of usable + * physical memory: + * + * * @min_low_pfn - the lowest PFN that is available in the system + * * @max_low_pfn - the highest PFN that may be addressed by low + * memory (%ZONE_NORMAL) + * * @max_pfn - the last PFN available to the system. + * + * After those limits are determined, the :c:func:`init_bootmem` or + * :c:func:`init_bootmem_node` function should be called to initialize + * the bootmem allocator. The UMA case should use the `init_bootmem` + * function. It will initialize ``contig_page_data`` structure that + * represents the only memory node in the system. In the NUMA case the + * `init_bootmem_node` function should be called to initialize the + * bootmem allocator for each node. + * + * Once the allocator is set up, it is possible to use either single + * node or NUMA variant of the allocation APIs. + */ + #ifndef CONFIG_NEED_MULTIPLE_NODES struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] @@ -62,6 +109,8 @@ static unsigned long __init bootmap_bytes(unsigned long pages) /** * bootmem_bootmap_pages - calculate bitmap size in pages * @pages: number of pages the bitmap has to represent + * + * Return: the number of pages needed to hold the bitmap. */ unsigned long __init bootmem_bootmap_pages(unsigned long pages) { @@ -121,7 +170,7 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, * @startpfn: first pfn on the node * @endpfn: first pfn after the node * - * Returns the number of bytes needed to hold the bitmap for this node. + * Return: the number of bytes needed to hold the bitmap for this node. 
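To make the initialization sequence described in the bootmem overview above concrete, here is a minimal sketch of how a UMA port's setup_arch() might bring the allocator up. It is illustrative only: the PFN values are hypothetical placeholders, and a real port derives them from the firmware memory map.

    /* Sketch: bootmem bring-up on a UMA machine; values are placeholders. */
    unsigned long bitmap_bytes;

    min_low_pfn = PFN_UP(__pa(_end));  /* first free PFN after the kernel */
    max_low_pfn = max_pfn = 0x20000;   /* hypothetical 512 MiB of RAM */

    /* Place the bitmap right after the kernel; all RAM starts reserved. */
    bitmap_bytes = init_bootmem(min_low_pfn, max_low_pfn);

    /* Release the usable range, then re-reserve the bitmap itself. */
    free_bootmem(PFN_PHYS(min_low_pfn), PFN_PHYS(max_low_pfn - min_low_pfn));
    reserve_bootmem(PFN_PHYS(min_low_pfn), bitmap_bytes, BOOTMEM_DEFAULT);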
*/ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) @@ -134,7 +183,7 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, * @start: pfn where the bitmap is to be placed * @pages: number of available physical pages * - * Returns the number of bytes needed to hold the bitmap. + * Return: the number of bytes needed to hold the bitmap. */ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) { @@ -143,15 +192,6 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -/* - * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting physical address of the range - * @size: size of the range in bytes - * - * This is only useful when the bootmem allocator has already been torn - * down, but we are still initializing the system. Pages are given directly - * to the page allocator, no bootmem metadata is updated because it is gone. - */ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; @@ -264,11 +304,6 @@ void __init reset_all_zones_managed_pages(void) reset_managed_pages_done = 1; } -/** - * free_all_bootmem - release free pages to the buddy allocator - * - * Returns the number of pages actually released. - */ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; @@ -385,16 +420,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, BUG(); } -/** - * free_bootmem_node - mark a page range as usable - * @pgdat: node the range resides on - * @physaddr: starting address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must reside completely on the specified node. - */ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { @@ -408,15 +433,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, mark_bootmem_node(pgdat->bdata, start, end, 0, 0); } -/** - * free_bootmem - mark a page range as usable - * @physaddr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must be contiguous but may span node boundaries. - */ void __init free_bootmem(unsigned long physaddr, unsigned long size) { unsigned long start, end; @@ -439,6 +455,8 @@ void __init free_bootmem(unsigned long physaddr, unsigned long size) * Partial pages will be reserved. * * The range must reside completely on the specified node. + * + * Return: 0 on success, -errno on failure. */ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) @@ -460,6 +478,8 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, * Partial pages will be reserved. * * The range must be contiguous but may span node boundaries. + * + * Return: 0 on success, -errno on failure. 
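A classic caller of the reservation interface documented above is initrd protection: the range handed over by the bootloader must be reserved before any other early allocation can land on top of it. A hedged sketch; initrd_start and initrd_end are the usual kernel globals, but whether a port reserves the virtual or the physical range varies, so treat the __pa() conversion as illustrative:

    /* Sketch: keep the bootloader-provided initrd safe from bootmem. */
    if (initrd_start && initrd_end > initrd_start)
            reserve_bootmem(__pa(initrd_start),
                            initrd_end - initrd_start,
                            BOOTMEM_EXCLUSIVE);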
*/ int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) @@ -646,19 +666,6 @@ restart: return NULL; } -/** - * __alloc_bootmem_nopanic - allocate boot memory without panicking - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * Returns NULL on failure. - */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { @@ -682,19 +689,6 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, return NULL; } -/** - * __alloc_bootmem - allocate boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { @@ -754,21 +748,6 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, return NULL; } -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { @@ -807,19 +786,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -/** - * __alloc_bootmem_low - allocate low boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { @@ -834,21 +800,6 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size, ARCH_LOW_ADDRESS_LIMIT); } -/** - * __alloc_bootmem_low_node - allocate low boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. 
- */ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 25346bd99364..a9e1e093df51 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_try_charge(pages[i], vma->vm_mm, + mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { if (pages[i]) put_page(pages[i]); @@ -1312,7 +1312,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, + if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/mm/init-mm.c b/mm/init-mm.c index f0179c9c04c2..a787a319211e 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -15,6 +15,16 @@ #define INIT_MM_CONTEXT(name) #endif +/* + * For dynamically allocated mm_structs, there is a dynamically sized cpumask + * at the end of the structure, the size of which depends on the maximum CPU + * number the system can see. That way we allocate only as much memory for + * mm_cpumask() as needed for the hundreds, or thousands of processes that + * a system typically runs. + * + * Since there is only one init_mm in the entire system, keep it simple + * and size this cpu_bitmask to NR_CPUS. + */ struct mm_struct init_mm = { .mm_rb = RB_ROOT, .pgd = swapper_pg_dir, @@ -25,5 +35,6 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, + .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/memblock.c b/mm/memblock.c index 4b5d245fafc1..b4ad05764745 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -27,6 +27,61 @@ #include "internal.h" +/** + * DOC: memblock overview + * + * Memblock is a method of managing memory regions during the early + * boot period when the usual kernel memory allocators are not up and + * running. + * + * Memblock views the system memory as collections of contiguous + * regions. There are several types of these collections: + * + * * ``memory`` - describes the physical memory available to the + * kernel; this may differ from the actual physical memory installed + * in the system, for instance when the memory is restricted with + * ``mem=`` command line parameter + * * ``reserved`` - describes the regions that were allocated + * * ``physmap`` - describes the actual physical memory regardless of + * the possible restrictions; the ``physmap`` type is only available + * on some architectures. + * + * Each region is represented by :c:type:`struct memblock_region` that + * defines the region extents, its attributes and NUMA node id on NUMA + * systems. Every memory type is described by the :c:type:`struct + * memblock_type` which contains an array of memory regions along with + * the allocator metadata. 
The memory types are nicely wrapped with + * :c:type:`struct memblock`. This structure is statically initialized + * at build time. The region arrays for the "memory" and "reserved" + * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the + * "physmap" type to %INIT_PHYSMEM_REGIONS. + * The :c:func:`memblock_allow_resize` enables automatic resizing of + * the region arrays during addition of new regions. This feature + * should be used with care so that memory allocated for the region + * array will not overlap with areas that should be reserved, for + * example initrd. + * + * The early architecture setup should tell memblock what the physical + * memory layout is by using :c:func:`memblock_add` or + * :c:func:`memblock_add_node` functions. The first function does not + * assign the region to a NUMA node and it is appropriate for UMA + * systems. Yet, it is possible to use it on NUMA systems as well and + * assign the region to a NUMA node later in the setup process using + * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node` + * performs such an assignment directly. + * + * Once memblock is set up the memory can be allocated using either + * memblock or bootmem APIs. + * + * As the system boot progresses, the architecture specific + * :c:func:`mem_init` function frees all the memory to the buddy page + * allocator. + * + * If an architecture enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * memblock data structures will be discarded after the system + * initialization completes. + */ + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -61,7 +116,7 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; -ulong __init_memblock choose_memblock_flags(void) +enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -93,10 +148,11 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, return i < type->cnt; } -/* +/** * __memblock_find_range_bottom_up - find free area utility in bottom-up * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @flags: pick from blocks based on memory attributes @@ -104,13 +160,13 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * - * RETURNS: + * Return: * Found address on success, 0 on failure.
*/ static phys_addr_t __init_memblock __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, - ulong flags) + enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; @@ -130,7 +186,8 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, /** * __memblock_find_range_top_down - find free area utility, in top-down * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node @@ -138,13 +195,13 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * * Utility called from memblock_find_in_range_node(), find free area top-down. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, - ulong flags) + enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; @@ -170,7 +227,8 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @size: size of free area to find * @align: alignment of free area to find * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @flags: pick from blocks based on memory attributes * @@ -184,12 +242,13 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * * If bottom-up allocation failed, will try to allocate memory top-down. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, ulong flags) + phys_addr_t end, int nid, + enum memblock_flags flags) { phys_addr_t kernel_end, ret; @@ -239,13 +298,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /** * memblock_find_in_range - find free area in given range * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * * Find @size free area aligned to @align in the specified range. * - * RETURNS: + * Return: * Found address on success, 0 on failure. 
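Finding a range does not claim it: callers pair the search with an explicit memblock_reserve(), which is safe because boot is still single-threaded at this point. A minimal sketch of the find-then-reserve pattern; the size and limit below are arbitrary examples:

    /* Sketch: grab a free, 1 MiB aligned, 1 MiB block below 4 GiB. */
    phys_addr_t base;

    base = memblock_find_in_range(0, SZ_4G, SZ_1M, SZ_1M);
    if (!base)
            return -ENOMEM;        /* 0 means no suitable range was found */
    memblock_reserve(base, SZ_1M);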
*/ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, @@ -253,7 +313,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t align) { phys_addr_t ret; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); again: ret = memblock_find_in_range_node(size, align, start, end, @@ -289,7 +349,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK /** - * Discard memory and reserved arrays if they were allocated + * memblock_discard - discard memory and reserved arrays if they were allocated */ void __init memblock_discard(void) { @@ -319,11 +379,11 @@ void __init memblock_discard(void) * * Double the size of the @type regions array. If memblock is being used to * allocate memory for a new reserved regions array and there is a previously - * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * allocated memory range [@new_area_start, @new_area_start + @new_area_size] * waiting to be reserved, ensure the memory used by the new array does * not overlap. * - * RETURNS: + * Return: * 0 on success, -1 on failure. */ static int __init_memblock memblock_double_array(struct memblock_type *type, @@ -468,13 +528,14 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @nid: node id of the new region * @flags: flags of the new region * - * Insert new memblock region [@base,@base+@size) into @type at @idx. + * Insert new memblock region [@base, @base + @size) into @type at @idx. * @type must already have extra room to accommodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, phys_addr_t size, - int nid, unsigned long flags) + int nid, + enum memblock_flags flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -496,17 +557,17 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @nid: nid of the new region * @flags: flags of the new region * - * Add new memblock region [@base,@base+@size) into @type. The new region + * Add new memblock region [@base, @base + @size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already * existing regions. @type is guaranteed to be minimal (all neighbouring * compatible regions are merged) after the addition. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, - int nid, unsigned long flags) + int nid, enum memblock_flags flags) { bool insert = false; phys_addr_t obase = base; @@ -590,12 +651,35 @@ repeat: } } +/** + * memblock_add_node - add new memblock region within a NUMA node + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. See memblock_add_range() description for mode details + * + * Return: + * 0 on success, -errno on failure. + */ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { return memblock_add_range(&memblock.memory, base, size, nid, 0); } +/** + * memblock_add - add new memblock region + * @base: base address of the new region + * @size: size of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. 
See memblock_add_range() description for mode details + * + * Return: + * 0 on success, -errno on failure. + */ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; @@ -615,11 +699,11 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) * @end_rgn: out parameter for the end of isolated region * * Walk @type and ensure that regions don't cross the boundaries defined by - * [@base,@base+@size). Crossing regions are split at the boundaries, + * [@base, @base + @size). Crossing regions are split at the boundaries, * which may create at most two more regions. The index of the first * region inside the range is returned in *@start_rgn and end in *@end_rgn. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ static int __init_memblock memblock_isolate_range(struct memblock_type *type, @@ -730,10 +814,15 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) } /** + * memblock_setclr_flag - set or clear flag for a memory region + * @base: base address of the region + * @size: size of the region + * @set: set or clear the flag + * @flag: the flag to udpate * * This function isolates region [@base, @base + @size), and sets/clears flag * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ static int __init_memblock memblock_setclr_flag(phys_addr_t base, phys_addr_t size, int set, int flag) @@ -760,7 +849,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { @@ -772,7 +861,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { @@ -784,7 +873,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) { @@ -798,7 +887,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) { @@ -810,7 +899,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) { @@ -875,7 +964,8 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. 
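These iterators are rarely called directly; they sit behind the for_each_*_range() macro family. A sketch of the usual way to walk every free range (memory minus reserved), assuming no NUMA or mirroring constraints:

    /* Sketch: print all free physical ranges known to memblock. */
    phys_addr_t start, end;
    u64 i;

    for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
                            &start, &end, NULL)
            pr_info("  free: [%pa-%pa]\n", &start, &end);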
*/ -void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, +void __init_memblock __next_mem_range(u64 *idx, int nid, + enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -970,9 +1060,6 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, /** * __next_mem_range_rev - generic next function for for_each_*_range_rev() * - * Finds the next range from type_a which is not marked as unsuitable - * in type_b. - * * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes @@ -982,9 +1069,13 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * + * Finds the next range from type_a which is not marked as unsuitable + * in type_b. + * * Reverse of __next_mem_range(). */ -void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, + enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -1116,10 +1207,10 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base, @base + @size) to @nid. * Regions which cross the area boundaries are split as necessary. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, @@ -1142,7 +1233,8 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, ulong flags) + phys_addr_t end, int nid, + enum memblock_flags flags) { phys_addr_t found; @@ -1164,7 +1256,7 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, - ulong flags) + enum memblock_flags flags) { return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, flags); @@ -1172,14 +1264,14 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr, - int nid, ulong flags) + int nid, enum memblock_flags flags) { return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); phys_addr_t ret; again: @@ -1243,7 +1335,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * The allocation is performed from memory region limited by * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. * - * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0. * * The phys address of allocated boot memory block is converted to virtual and * allocated memory is reset to 0. 
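For callers, the properties spelled out above (%SMP_CACHE_BYTES alignment when @align is 0, memory that is zeroed and returned as a virtual address) make the *_try_nid variants the usual choice for early per-node tables. A minimal sketch; struct foo_stats and the node id are hypothetical:

    /* Sketch: zeroed, node-preferred boot-time allocation. */
    struct foo_stats *stats;   /* hypothetical per-node structure */
    int nid = 0;               /* hypothetical target node */

    stats = memblock_virt_alloc_try_nid(sizeof(*stats), 0,
                                        0, BOOTMEM_ALLOC_ACCESSIBLE, nid);
    /* This variant panics on failure, so stats is never NULL here. */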
@@ -1251,7 +1343,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * In addition, function sets the min_count to 0 using kmemleak_alloc for * allocated boot memory block, so that it is never reported as leaks. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ static void * __init memblock_virt_alloc_internal( @@ -1261,7 +1353,7 @@ static void * __init memblock_virt_alloc_internal( { phys_addr_t alloc; void *ptr; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; @@ -1336,7 +1428,7 @@ done: * info), if enabled. Does not zero allocated memory, does not panic if request * cannot be satisfied. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_virt_alloc_try_nid_raw( @@ -1373,7 +1465,7 @@ void * __init memblock_virt_alloc_try_nid_raw( * Public function, provides additional debug information (including caller * info), if enabled. This function zeroes the allocated memory. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_virt_alloc_try_nid_nopanic( @@ -1409,7 +1501,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_virt_alloc_try_nid( @@ -1453,9 +1545,9 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) memblock_remove_range(&memblock.reserved, base, size); } -/* +/** * __memblock_free_late - free bootmem block pages directly to buddy allocator - * @addr: phys starting address of the boot memory block + * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * * This is only useful when the bootmem allocator has already been torn @@ -1667,9 +1759,9 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, * @base: base of region to check * @size: size of region to check * - * Check if the region [@base, @base+@size) is a subset of a memory block. + * Check if the region [@base, @base + @size) is a subset of a memory block. * - * RETURNS: + * Return: * 0 if false, non-zero if true */ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) @@ -1688,9 +1780,10 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz * @base: base of region to check * @size: size of region to check * - * Check if the region [@base, @base+@size) intersects a reserved memory block. + * Check if the region [@base, @base + @size) intersects a reserved + * memory block. * - * RETURNS: + * Return: * True if they intersect, false if not. 
*/ bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) @@ -1737,7 +1830,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) static void __init_memblock memblock_dump(struct memblock_type *type) { phys_addr_t base, end, size; - unsigned long flags; + enum memblock_flags flags; int idx; struct memblock_region *rgn; @@ -1755,7 +1848,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", + pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n", type->name, idx, &base, &end, &size, nid_buf, flags); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2173f7e5164..b836e7f00309 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5600,6 +5600,19 @@ out: return ret; } +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) +{ + struct mem_cgroup *memcg; + int ret; + + ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); + memcg = *memcgp; + mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); + return ret; +} + /** * mem_cgroup_commit_charge - commit a page charge * @page: page to charge diff --git a/mm/memfd.c b/mm/memfd.c index 27069518e3c5..2bb5e257080e 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -326,7 +326,7 @@ SYSCALL_DEFINE2(memfd_create, goto err_fd; } file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; + file->f_flags |= O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { file_seals = memfd_file_seals_ptr(file); diff --git a/mm/memory.c b/mm/memory.c index c5e87a3a82ba..348279ff6e51 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ #ifdef CONFIG_HAVE_RCU_TABLE_FREE -/* - * See the comment near struct mmu_table_batch. - */ - static void tlb_remove_table_smp_sync(void *arg) { - /* Simply deliver the interrupt */ + struct mm_struct __maybe_unused *mm = arg; + /* + * On most architectures this does nothing. Simply delivering the + * interrupt is enough to prevent races with software page table + * walking like that done in get_user_pages_fast. + * + * See the comment near struct mmu_table_batch. + */ + tlb_flush_remove_tables_local(mm); } -static void tlb_remove_table_one(void *table) +static void tlb_remove_table_one(void *table, struct mmu_gather *tlb) { /* * This isn't an RCU grace period and hence the page-tables cannot be @@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table) * It is however sufficient for software page-table walkers that rely on * IRQ disabling. See the comment near struct mmu_table_batch. 
*/ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1); __tlb_remove_table(table); } @@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; + tlb_flush_remove_tables(tlb->mm); + if (*batch) { call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; @@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { - tlb_remove_table_one(table); + tlb_remove_table_one(table, tlb); return; } (*batch)->nr = 0; @@ -1884,6 +1890,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; + if (!pfn_modify_allowed(pfn, pgprot)) + return -EACCES; + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, @@ -1919,6 +1928,9 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, pfn); + if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + return -EACCES; + /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* @@ -1980,6 +1992,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; spinlock_t *ptl; + int err = 0; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) @@ -1987,12 +2000,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); + if (!pfn_modify_allowed(pfn, prot)) { + err = -EACCES; + break; + } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - return 0; + return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -2001,6 +2018,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, { pmd_t *pmd; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); @@ -2009,9 +2027,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); - if (remap_pte_range(mm, pmd, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pmd++, addr = next, addr != end); return 0; } @@ -2022,6 +2041,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, { pud_t *pud; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); @@ -2029,9 +2049,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (remap_pmd_range(mm, pud, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pud++, addr = next, addr != end); return 0; } @@ -2042,6 +2063,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, { p4d_t *p4d; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); @@ -2049,9 +2071,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 
-ENOMEM; do { next = p4d_addr_end(addr, end); - if (remap_pud_range(mm, p4d, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pud_range(mm, p4d, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (p4d++, addr = next, addr != end); return 0; } @@ -2501,7 +2524,7 @@ static int wp_page_copy(struct vm_fault *vmf) cow_user_page(new_page, old_page, vmf->address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -3001,8 +3024,8 @@ int do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3163,7 +3186,8 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, + false)) goto oom_free_page; /* @@ -3659,7 +3683,7 @@ static int do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, &vmf->memcg, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; diff --git a/mm/mprotect.c b/mm/mprotect.c index 625608bc8962..6d331620b9e5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -306,6 +306,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, return pages; } +static int prot_none_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_test(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return 0; +} + +static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + pgprot_t new_pgprot = vm_get_page_prot(newflags); + struct mm_walk prot_none_walk = { + .pte_entry = prot_none_pte_entry, + .hugetlb_entry = prot_none_hugetlb_entry, + .test_walk = prot_none_test, + .mm = current->mm, + .private = &new_pgprot, + }; + + return walk_page_range(start, end, &prot_none_walk); +} + int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) @@ -324,6 +360,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } /* + * Do PROT_NONE PFN permission checks here when we can still + * bail out without undoing a lot of state. This is a rather + * uncommon case, so doesn't need to be very optimized. + */ + if (arch_has_pfn_modify_check() && + (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { + error = prot_none_walk(vma, start, end, newflags); + if (error) + return error; + } + + /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we * make it unwritable again. 
hugetlb mapping were accounted for diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 9b02fda0886b..439af3b765a7 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -42,7 +42,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, { void *ptr; u64 addr; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); if (limit > memblock.current_limit) limit = memblock.current_limit; @@ -72,7 +72,7 @@ again: return ptr; } -/* +/** * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range * @size: size of the range in bytes @@ -176,7 +176,7 @@ void __init reset_all_zones_managed_pages(void) /** * free_all_bootmem - release free pages to the buddy allocator * - * Returns the number of pages actually released. + * Return: the number of pages actually released. */ unsigned long __init free_all_bootmem(void) { @@ -193,7 +193,7 @@ unsigned long __init free_all_bootmem(void) /** * free_bootmem_node - mark a page range as usable * @pgdat: node the range resides on - * @physaddr: starting address of the range + * @physaddr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. @@ -208,7 +208,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, /** * free_bootmem - mark a page range as usable - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. @@ -256,7 +256,7 @@ restart: * * Allocation may happen on any node in the system. * - * Returns NULL on failure. + * Return: address of the allocated region or %NULL on failure. */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) @@ -293,6 +293,8 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, * Allocation may happen on any node in the system. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) @@ -367,6 +369,8 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, * can not hold the requested memory. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) @@ -396,6 +400,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, * Allocation may happen on any node in the system. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) @@ -425,6 +431,8 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size, * can not hold the requested memory. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. 
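As the wrappers above document, @goal is a preference rather than a hard limit. The classic idiom for steering early allocations away from the ISA DMA zone, sketched here with an arbitrary one-page request:

    /* Sketch: prefer memory above the DMA range for an early table. */
    void *table;

    table = __alloc_bootmem(PAGE_SIZE, SMP_CACHE_BYTES,
                            __pa(MAX_DMA_ADDRESS));
    /* Panicking variant: no NULL check needed; the goal is best-effort. */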
*/ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a790ef4be74e..0922ef5d2e46 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -155,16 +155,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, - * they should always be called with pm_mutex held (gfp_allowed_mask also should - * only be modified with pm_mutex held, unless the suspend/hibernate code is - * guaranteed not to run in parallel with that modification). + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; @@ -173,7 +174,7 @@ void pm_restore_gfp_mask(void) void pm_restrict_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); @@ -6939,9 +6940,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + struct page *page = virt_to_page(pos); + void *direct_map_addr; + + /* + * 'direct_map_addr' might be different from 'pos' + * because some architectures' virt_to_page() + * work with aliases. Getting the direct map + * address ensures that we get a _writeable_ + * alias for the memset(). + */ + direct_map_addr = page_address(page); if ((unsigned int)poison <= 0xFF) - memset(pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page(pos)); + memset(direct_map_addr, poison, PAGE_SIZE); + + free_reserved_page(page); } if (pages && s) diff --git a/mm/page_io.c b/mm/page_io.c index b41cf9644585..aafd19ec1db4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio_associate_blkcg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); diff --git a/mm/readahead.c b/mm/readahead.c index e273f0de3376..a59ea70527b9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -19,6 +19,7 @@ #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> +#include <linux/blk-cgroup.h> #include "internal.h" @@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; + unsigned long add_pages; pgoff_t prev_offset; /* @@ -474,10 +476,17 @@ readit: * Will this read hit the readahead marker made by itself? 
* If so, trigger the readahead marker hit now, and merge * the resulted next readahead window into the current one. + * Take care of maximum IO pages as above. */ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max_pages); - ra->size += ra->async_size; + add_pages = get_next_ra_size(ra, max_pages); + if (ra->size + add_pages <= max_pages) { + ra->async_size = add_pages; + ra->size += add_pages; + } else { + ra->size = max_pages; + ra->async_size = max_pages >> 1; + } } return ra_submit(ra, mapping, filp); @@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + if (blk_cgroup_congested()) + return; + /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { force_page_cache_readahead(mapping, filp, offset, req_size); @@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping, if (inode_read_congested(mapping->host)) return; + if (blk_cgroup_congested()) + return; + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); } diff --git a/mm/shmem.c b/mm/shmem.c index 41b9bbf24e16..06ebe17bb924 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, - false); + error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, + &memcg, false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -1713,7 +1713,7 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1819,7 +1819,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); if (error) goto unacct; @@ -2292,7 +2292,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, __SetPageSwapBacked(page); __SetPageUptodate(page); - ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; @@ -3897,18 +3897,11 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static const struct dentry_operations anon_ops = { - .d_dname = simple_dname -}; - static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags, unsigned int i_flags) { - struct file *res; struct inode *inode; - struct path path; - struct super_block *sb; - struct qstr this; + struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); @@ -3919,41 +3912,21 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - res = ERR_PTR(-ENOMEM); - this.name = name; - this.len = strlen(name); - this.hash = 0; /* will go */ - sb = mnt->mnt_sb; - path.mnt = mntget(mnt); - path.dentry = d_alloc_pseudo(sb, &this); - if (!path.dentry) - goto put_memory; - d_set_d_op(path.dentry, &anon_ops); - - res = ERR_PTR(-ENOSPC); - inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags); - if (!inode) - goto put_memory; - 
+ inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, + flags); + if (unlikely(!inode)) { + shmem_unacct_size(flags, size); + return ERR_PTR(-ENOSPC); + } inode->i_flags |= i_flags; - d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); + if (!IS_ERR(res)) + res = alloc_file_pseudo(inode, mnt, name, O_RDWR, + &shmem_file_operations); if (IS_ERR(res)) - goto put_path; - - res = alloc_file(&path, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); - if (IS_ERR(res)) - goto put_path; - - return res; - -put_memory: - shmem_unacct_size(flags, size); -put_path: - path_put(&path); + iput(inode); return res; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 2cc2972eedaf..8837b22c848d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2909,6 +2909,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) return 0; } + +/* + * Find out how many pages are allowed for a single swap device. There + * are two limiting factors: + * 1) the number of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte, as defined by the different + * architectures. + * + * In order to find the largest possible bit mask, a swap entry with + * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again, and finally the swap offset is + * extracted. + * + * This will mask all the bits from the initial ~0UL mask that can't + * be encoded in either the swp_entry_t or the architecture definition + * of a swap pte. + */ +unsigned long generic_max_swapfile_size(void) +{ + return swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; +} + +/* Can be overridden by an architecture for additional checks. */ +__weak unsigned long max_swapfile_size(void) +{ + return generic_max_swapfile_size(); +} + static unsigned long read_swap_header(struct swap_info_struct *p, union swap_header *swap_header, struct inode *inode) @@ -2944,22 +2973,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, p->cluster_next = 1; p->cluster_nr = 0; - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number - * of bits for the swap offset in the swp_entry_t type, and - * 2) the number of bits in the swap pte as defined by the - * different architectures. In order to find the - * largest possible bit mask, a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again, and finally the swap - * offset is extracted. This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. - */ - maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + maxpages = max_swapfile_size(); last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); @@ -3731,6 +3745,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, + gfp_t gfp_mask) +{ + struct swap_info_struct *si, *next; + if (!(gfp_mask & __GFP_IO) || !memcg) + return; + + if (!blk_cgroup_congested()) + return; + + /* + * We've already scheduled a throttle, avoid taking the global swap + * lock. 
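To make the round-trip probe described above concrete: any offset bits that do not survive the swp_entry -> pte -> swp_entry conversion are masked away, so the all-ones probe comes back as the largest encodable offset. A toy model of the arithmetic, not kernel code:

    /* Toy model: only 'bits' offset bits survive the swap pte round trip.
     * Assumes 0 < bits < 64. */
    unsigned long toy_max_swapfile_size(int bits)
    {
            unsigned long offset = ~0UL;          /* all-ones probe value */

            offset &= (1UL << bits) - 1;          /* what the pte can hold */
            return offset + 1;                    /* highest offset + 1 pages */
    }

An architecture whose swap pte kept, say, 32 offset bits would thus be limited to 2^32 pages (16 TiB with 4 KiB pages) per swap device, before any further cap applied by a max_swapfile_size() override.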
+ */ + if (current->throttle_queue) + return; + + spin_lock(&swap_avail_lock); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], + avail_lists[node]) { + if (si->bdev) { + blkcg_schedule_throttle(bdev_get_queue(si->bdev), + true); + break; + } + } + spin_unlock(&swap_avail_lock); +} +#endif + static int __init swapfile_init(void) { int nid; diff --git a/mm/usercopy.c b/mm/usercopy.c index e9e9325f7638..852eb4e53f06 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -20,6 +20,8 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> +#include <linux/atomic.h> +#include <linux/jump_label.h> #include <asm/sections.h> /* @@ -240,6 +242,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } +static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); + /* * Validates that the given object is: * - not bogus address @@ -248,6 +252,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { + if (static_branch_unlikely(&bypass_usercopy_checks)) + return; + /* Skip all tests if size is zero. */ if (!n) return; @@ -279,3 +286,21 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) check_kernel_text_object((const unsigned long)ptr, n, to_user); } EXPORT_SYMBOL(__check_object_size); + +static bool enable_checks __initdata = true; + +static int __init parse_hardened_usercopy(char *str) +{ + return strtobool(str, &enable_checks); +} + +__setup("hardened_usercopy=", parse_hardened_usercopy); + +static int __init set_hardened_usercopy(void) +{ + if (enable_checks == false) + static_branch_enable(&bypass_usercopy_checks); + return 1; +} + +late_initcall(set_hardened_usercopy);
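For context on what the final usercopy hunk gates: copy_to_user()/copy_from_user() calls with a non-constant size funnel through __check_object_size() when CONFIG_HARDENED_USERCOPY is enabled, so the new static key skips the whole validation with a single patched branch once the kernel is booted with hardened_usercopy=off. A sketch of the kind of driver path these checks cover; all names here are hypothetical:

    #include <linux/fs.h>
    #include <linux/uaccess.h>

    struct foo_dev {
            char msg[64];
    };

    /* Hypothetical read() handler; 'buf' and 'count' come from userspace. */
    static ssize_t foo_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
    {
            struct foo_dev *dev = file->private_data;

            if (count > sizeof(dev->msg))
                    count = sizeof(dev->msg);
            /*
             * copy_to_user() invokes check_object_size(), which lands in
             * __check_object_size() above and verifies that dev->msg
             * stays within a single, whitelisted object.
             */
            if (copy_to_user(buf, dev->msg, count))
                    return -EFAULT;
            return count;
    }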