summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-10-14 19:57:24 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2020-10-14 19:57:24 +0300
commitd5660df4a555a98154da850fb61f118269d0a283 (patch)
treeb2c5f3a15c300499df930321c32fd7d288467d6b
parentb5fc7a89e58bcc059a3d5e4db79c481fb437de59 (diff)
parentf1f4f3ab54e9a52c7610c998ff8255f019742e67 (diff)
downloadlinux-d5660df4a555a98154da850fb61f118269d0a283.tar.xz
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: "181 patches. Subsystems affected by this patch series: kbuild, scripts, ntfs, ocfs2, vfs, mm (slab, slub, kmemleak, dax, debug, pagecache, fadvise, gup, swap, memremap, memcg, selftests, pagemap, mincore, hmm, dma, memory-failure, vmallo and migration)" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (181 commits) mm/migrate: remove obsolete comment about device public mm/migrate: remove cpages-- in migrate_vma_finalize() mm, oom_adj: don't loop through tasks in __set_oom_adj when not necessary memblock: use separate iterators for memory and reserved regions memblock: implement for_each_reserved_mem_region() using __next_mem_region() memblock: remove unused memblock_mem_size() x86/setup: simplify reserve_crashkernel() x86/setup: simplify initrd relocation and reservation arch, drivers: replace for_each_membock() with for_each_mem_range() arch, mm: replace for_each_memblock() with for_each_mem_pfn_range() memblock: reduce number of parameters in for_each_mem_range() memblock: make memblock_debug and related functionality private memblock: make for_each_memblock_type() iterator private mircoblaze: drop unneeded NUMA and sparsemem initializations riscv: drop unneeded node initialization h8300, nds32, openrisc: simplify detection of memory extents arm64: numa: simplify dummy_numa_init() arm, xtensa: simplify initialization of high memory pages dma-contiguous: simplify cma_early_percent_memory() KVM: PPC: Book3S HV: simplify kvm_cma_reserve() ...
-rw-r--r--.clang-format5
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst69
-rw-r--r--Documentation/admin-guide/mm/hugetlbpage.rst2
-rw-r--r--Documentation/dev-tools/kasan.rst74
-rw-r--r--Documentation/dev-tools/kmemleak.rst2
-rw-r--r--Documentation/kbuild/makefiles.rst20
-rw-r--r--Documentation/vm/active_mm.rst2
-rw-r--r--Documentation/x86/x86_64/boot-options.rst4
-rw-r--r--MAINTAINERS2
-rw-r--r--Makefile9
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/include/asm/tlb.h1
-rw-r--r--arch/arm/kernel/setup.c18
-rw-r--r--arch/arm/mm/init.c59
-rw-r--r--arch/arm/mm/mmu.c39
-rw-r--r--arch/arm/mm/pmsa-v7.c23
-rw-r--r--arch/arm/mm/pmsa-v8.c17
-rw-r--r--arch/arm/xen/mm.c7
-rw-r--r--arch/arm64/Kconfig2
-rw-r--r--arch/arm64/kernel/machine_kexec_file.c6
-rw-r--r--arch/arm64/kernel/setup.c4
-rw-r--r--arch/arm64/kernel/vdso/Makefile7
-rw-r--r--arch/arm64/mm/init.c11
-rw-r--r--arch/arm64/mm/kasan_init.c10
-rw-r--r--arch/arm64/mm/mmu.c11
-rw-r--r--arch/arm64/mm/numa.c15
-rw-r--r--arch/c6x/kernel/setup.c9
-rw-r--r--arch/h8300/kernel/setup.c8
-rw-r--r--arch/microblaze/mm/init.c21
-rw-r--r--arch/mips/cavium-octeon/dma-octeon.c14
-rw-r--r--arch/mips/kernel/setup.c31
-rw-r--r--arch/mips/netlogic/xlp/setup.c2
-rw-r--r--arch/nds32/kernel/setup.c8
-rw-r--r--arch/openrisc/kernel/setup.c9
-rw-r--r--arch/openrisc/mm/init.c8
-rw-r--r--arch/powerpc/kernel/fadump.c57
-rw-r--r--arch/powerpc/kexec/file_load_64.c16
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c12
-rw-r--r--arch/powerpc/kvm/book3s_hv_uvmem.c14
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c16
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c10
-rw-r--r--arch/powerpc/mm/kasan/kasan_init_32.c8
-rw-r--r--arch/powerpc/mm/mem.c33
-rw-r--r--arch/powerpc/mm/numa.c7
-rw-r--r--arch/powerpc/mm/pgtable_32.c8
-rw-r--r--arch/riscv/mm/init.c36
-rw-r--r--arch/riscv/mm/kasan_init.c10
-rw-r--r--arch/s390/kernel/setup.c27
-rw-r--r--arch/s390/mm/page-states.c6
-rw-r--r--arch/s390/mm/vmem.c7
-rw-r--r--arch/sh/mm/init.c9
-rw-r--r--arch/sparc/mm/init_64.c12
-rw-r--r--arch/x86/include/asm/numa.h8
-rw-r--r--arch/x86/kernel/e820.c16
-rw-r--r--arch/x86/kernel/setup.c56
-rw-r--r--arch/x86/mm/numa.c13
-rw-r--r--arch/x86/mm/numa_emulation.c3
-rw-r--r--arch/x86/xen/enlighten_pv.c2
-rw-r--r--arch/xtensa/mm/init.c55
-rw-r--r--drivers/acpi/numa/hmat.c76
-rw-r--r--drivers/acpi/numa/srat.c9
-rw-r--r--drivers/base/core.c2
-rw-r--r--drivers/bus/mvebu-mbus.c12
-rw-r--r--drivers/dax/Kconfig6
-rw-r--r--drivers/dax/Makefile3
-rw-r--r--drivers/dax/bus.c1049
-rw-r--r--drivers/dax/bus.h28
-rw-r--r--drivers/dax/dax-private.h60
-rw-r--r--drivers/dax/device.c138
-rw-r--r--drivers/dax/hmem/Makefile6
-rw-r--r--drivers/dax/hmem/device.c100
-rw-r--r--drivers/dax/hmem/hmem.c (renamed from drivers/dax/hmem.c)23
-rw-r--r--drivers/dax/kmem.c178
-rw-r--r--drivers/dax/pmem/compat.c2
-rw-r--r--drivers/dax/pmem/core.c22
-rw-r--r--drivers/firmware/efi/x86_fake_mem.c12
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_shmem.c4
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_dmem.c15
-rw-r--r--drivers/irqchip/irq-gic-v3-its.c2
-rw-r--r--drivers/nvdimm/badrange.c26
-rw-r--r--drivers/nvdimm/claim.c13
-rw-r--r--drivers/nvdimm/nd.h3
-rw-r--r--drivers/nvdimm/pfn_devs.c13
-rw-r--r--drivers/nvdimm/pmem.c27
-rw-r--r--drivers/nvdimm/region.c21
-rw-r--r--drivers/pci/p2pdma.c12
-rw-r--r--drivers/virtio/virtio_mem.c47
-rw-r--r--drivers/xen/unpopulated-alloc.c45
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ocfs2/alloc.c6
-rw-r--r--fs/ocfs2/localalloc.c2
-rw-r--r--fs/proc/base.c3
-rw-r--r--fs/proc/task_mmu.c104
-rw-r--r--fs/xattr.c22
-rw-r--r--include/acpi/acpi_numa.h14
-rw-r--r--include/kunit/test.h5
-rw-r--r--include/linux/acpi.h2
-rw-r--r--include/linux/compaction.h3
-rw-r--r--include/linux/compiler-clang.h8
-rw-r--r--include/linux/compiler-gcc.h2
-rw-r--r--include/linux/compiler.h2
-rw-r--r--include/linux/dax.h8
-rw-r--r--include/linux/export.h2
-rw-r--r--include/linux/fs.h4
-rw-r--r--include/linux/gfp.h6
-rw-r--r--include/linux/huge_mm.h3
-rw-r--r--include/linux/kasan.h6
-rw-r--r--include/linux/memblock.h88
-rw-r--r--include/linux/memcontrol.h13
-rw-r--r--include/linux/memory_hotplug.h23
-rw-r--r--include/linux/memremap.h11
-rw-r--r--include/linux/mm.h36
-rw-r--r--include/linux/mmap_lock.h5
-rw-r--r--include/linux/mmzone.h37
-rw-r--r--include/linux/numa.h11
-rw-r--r--include/linux/oom.h1
-rw-r--r--include/linux/page-flags.h42
-rw-r--r--include/linux/pagemap.h43
-rw-r--r--include/linux/range.h6
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/sched/coredump.h1
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/swap.h10
-rw-r--r--include/linux/swap_slots.h2
-rw-r--r--kernel/dma/contiguous.c11
-rw-r--r--kernel/fork.c25
-rw-r--r--kernel/resource.c11
-rw-r--r--lib/Kconfig.debug9
-rw-r--r--lib/Kconfig.kasan31
-rw-r--r--lib/Makefile5
-rw-r--r--lib/kunit/test.c13
-rw-r--r--lib/test_free_pages.c42
-rw-r--r--lib/test_hmm.c65
-rw-r--r--lib/test_kasan.c728
-rw-r--r--lib/test_kasan_module.c111
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile1
-rw-r--r--mm/compaction.c5
-rw-r--r--mm/debug.c18
-rw-r--r--mm/dmapool.c46
-rw-r--r--mm/fadvise.c9
-rw-r--r--mm/filemap.c74
-rw-r--r--mm/gup.c44
-rw-r--r--mm/gup_benchmark.c23
-rw-r--r--mm/huge_memory.c4
-rw-r--r--mm/hugetlb.c100
-rw-r--r--mm/internal.h3
-rw-r--r--mm/kasan/report.c34
-rw-r--r--mm/kmemleak.c8
-rw-r--r--mm/madvise.c21
-rw-r--r--mm/memblock.c98
-rw-r--r--mm/memcontrol.c260
-rw-r--r--mm/memory-failure.c5
-rw-r--r--mm/memory.c147
-rw-r--r--mm/memory_hotplug.c10
-rw-r--r--mm/mempolicy.c8
-rw-r--r--mm/mempool.c18
-rw-r--r--mm/memremap.c308
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mincore.c28
-rw-r--r--mm/mmap.c45
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c82
-rw-r--r--mm/page_counter.c2
-rw-r--r--mm/page_io.c14
-rw-r--r--mm/page_isolation.c39
-rw-r--r--mm/shmem.c19
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slab.h42
-rw-r--r--mm/slub.c33
-rw-r--r--mm/sparse.c10
-rw-r--r--mm/swap.c14
-rw-r--r--mm/swap_slots.c3
-rw-r--r--mm/swap_state.c36
-rw-r--r--mm/swapfile.c12
-rw-r--r--mm/truncate.c58
-rw-r--r--mm/vmalloc.c6
-rw-r--r--mm/vmscan.c5
-rw-r--r--mm/z3fold.c3
-rw-r--r--mm/zbud.c1
-rw-r--r--samples/Makefile1
-rw-r--r--samples/kmemleak/Makefile3
-rw-r--r--samples/kmemleak/kmemleak-test.c (renamed from mm/kmemleak-test.c)2
-rwxr-xr-xscripts/decodecode29
-rw-r--r--scripts/spelling.txt4
-rw-r--r--tools/testing/nvdimm/dax-dev.c22
-rw-r--r--tools/testing/nvdimm/test/iomap.c2
-rw-r--r--tools/testing/selftests/vm/Makefile17
-rw-r--r--tools/testing/selftests/vm/compaction_test.c11
-rw-r--r--tools/testing/selftests/vm/gup_benchmark.c14
-rw-r--r--tools/testing/selftests/vm/hmm-tests.c4
192 files changed, 3918 insertions, 2422 deletions
diff --git a/.clang-format b/.clang-format
index badfc1ba440a..6cbd6ee51610 100644
--- a/.clang-format
+++ b/.clang-format
@@ -203,11 +203,13 @@ ForEachMacros:
- 'for_each_matching_node'
- 'for_each_matching_node_and_match'
- 'for_each_member'
- - 'for_each_memblock'
+ - 'for_each_mem_region'
- 'for_each_memblock_type'
- 'for_each_memcg_cache_index'
- 'for_each_mem_pfn_range'
+ - '__for_each_mem_range'
- 'for_each_mem_range'
+ - '__for_each_mem_range_rev'
- 'for_each_mem_range_rev'
- 'for_each_migratetype_order'
- 'for_each_msi_entry'
@@ -271,6 +273,7 @@ ForEachMacros:
- 'for_each_registered_fb'
- 'for_each_requested_gpio'
- 'for_each_requested_gpio_in_range'
+ - 'for_each_reserved_mem_range'
- 'for_each_reserved_mem_region'
- 'for_each_rtd_codec_dais'
- 'for_each_rtd_codec_dais_rollback'
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index baa07b30845e..608d7c279396 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1259,6 +1259,10 @@ PAGE_SIZE multiple when read back.
can show up in the middle. Don't rely on items remaining in a
fixed position; use the keys to look up specific values!
+ If the entry has no per-node counter(or not show in the
+ mempry.numa_stat). We use 'npn'(non-per-node) as the tag
+ to indicate that it will not show in the mempry.numa_stat.
+
anon
Amount of memory used in anonymous mappings such as
brk(), sbrk(), and mmap(MAP_ANONYMOUS)
@@ -1270,15 +1274,11 @@ PAGE_SIZE multiple when read back.
kernel_stack
Amount of memory allocated to kernel stacks.
- slab
- Amount of memory used for storing in-kernel data
- structures.
-
- percpu
+ percpu(npn)
Amount of memory used for storing per-cpu kernel
data structures.
- sock
+ sock(npn)
Amount of memory used in network transmission buffers
shmem
@@ -1318,11 +1318,9 @@ PAGE_SIZE multiple when read back.
Part of "slab" that cannot be reclaimed on memory
pressure.
- pgfault
- Total number of page faults incurred
-
- pgmajfault
- Number of major page faults incurred
+ slab(npn)
+ Amount of memory used for storing in-kernel data
+ structures.
workingset_refault_anon
Number of refaults of previously evicted anonymous pages.
@@ -1348,37 +1346,68 @@ PAGE_SIZE multiple when read back.
workingset_nodereclaim
Number of times a shadow node has been reclaimed
- pgrefill
+ pgfault(npn)
+ Total number of page faults incurred
+
+ pgmajfault(npn)
+ Number of major page faults incurred
+
+ pgrefill(npn)
Amount of scanned pages (in an active LRU list)
- pgscan
+ pgscan(npn)
Amount of scanned pages (in an inactive LRU list)
- pgsteal
+ pgsteal(npn)
Amount of reclaimed pages
- pgactivate
+ pgactivate(npn)
Amount of pages moved to the active LRU list
- pgdeactivate
+ pgdeactivate(npn)
Amount of pages moved to the inactive LRU list
- pglazyfree
+ pglazyfree(npn)
Amount of pages postponed to be freed under memory pressure
- pglazyfreed
+ pglazyfreed(npn)
Amount of reclaimed lazyfree pages
- thp_fault_alloc
+ thp_fault_alloc(npn)
Number of transparent hugepages which were allocated to satisfy
a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE
is not set.
- thp_collapse_alloc
+ thp_collapse_alloc(npn)
Number of transparent hugepages which were allocated to allow
collapsing an existing range of pages. This counter is not
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+ memory.numa_stat
+ A read-only nested-keyed file which exists on non-root cgroups.
+
+ This breaks down the cgroup's memory footprint into different
+ types of memory, type-specific details, and other information
+ per node on the state of the memory management system.
+
+ This is useful for providing visibility into the NUMA locality
+ information within an memcg since the pages are allowed to be
+ allocated from any physical node. One of the use case is evaluating
+ application performance by combining this information with the
+ application's CPU allocation.
+
+ All memory amounts are in bytes.
+
+ The output format of memory.numa_stat is::
+
+ type N0=<bytes in node 0> N1=<bytes in node 1> ...
+
+ The entries are ordered to be human readable, and new entries
+ can show up in the middle. Don't rely on items remaining in a
+ fixed position; use the keys to look up specific values!
+
+ The entries can refer to the memory.stat.
+
memory.swap.current
A read-only single value file which exists on non-root
cgroups.
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 015a5f7d7854..f7b1c7462991 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -131,7 +131,7 @@ hugepages
parameter is preceded by an invalid hugepagesz parameter, it will
be ignored.
default_hugepagesz
- pecify the default huge page size. This parameter can
+ Specify the default huge page size. This parameter can
only be specified once on the command line. default_hugepagesz can
optionally be followed by the hugepages parameter to preallocate a
specific number of huge pages of default size. The number of default
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 38fd5681fade..c09c9ca2ff1c 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -13,10 +13,10 @@ KASAN uses compile-time instrumentation to insert validity checks before every
memory access, and therefore requires a compiler version that supports that.
Generic KASAN is supported in both GCC and Clang. With GCC it requires version
-8.3.0 or later. With Clang it requires version 7.0.0 or later, but detection of
+8.3.0 or later. Any supported Clang version is compatible, but detection of
out-of-bounds accesses for global variables is only supported since Clang 11.
-Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
+Tag-based KASAN is only supported in Clang.
Currently generic KASAN is supported for the x86_64, arm64, xtensa, s390 and
riscv architectures, and tag-based KASAN is supported only for arm64.
@@ -281,3 +281,73 @@ unmapped. This will require changes in arch-specific code.
This allows ``VMAP_STACK`` support on x86, and can simplify support of
architectures that do not have a fixed module region.
+
+CONFIG_KASAN_KUNIT_TEST & CONFIG_TEST_KASAN_MODULE
+--------------------------------------------------
+
+``CONFIG_KASAN_KUNIT_TEST`` utilizes the KUnit Test Framework for testing.
+This means each test focuses on a small unit of functionality and
+there are a few ways these tests can be run.
+
+Each test will print the KASAN report if an error is detected and then
+print the number of the test and the status of the test:
+
+pass::
+
+ ok 28 - kmalloc_double_kzfree
+or, if kmalloc failed::
+
+ # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163
+ Expected ptr is not null, but is
+ not ok 4 - kmalloc_large_oob_right
+or, if a KASAN report was expected, but not found::
+
+ # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:629
+ Expected kasan_data->report_expected == kasan_data->report_found, but
+ kasan_data->report_expected == 1
+ kasan_data->report_found == 0
+ not ok 28 - kmalloc_double_kzfree
+
+All test statuses are tracked as they run and an overall status will
+be printed at the end::
+
+ ok 1 - kasan
+
+or::
+
+ not ok 1 - kasan
+
+(1) Loadable Module
+~~~~~~~~~~~~~~~~~~~~
+
+With ``CONFIG_KUNIT`` enabled, ``CONFIG_KASAN_KUNIT_TEST`` can be built as
+a loadable module and run on any architecture that supports KASAN
+using something like insmod or modprobe. The module is called ``test_kasan``.
+
+(2) Built-In
+~~~~~~~~~~~~~
+
+With ``CONFIG_KUNIT`` built-in, ``CONFIG_KASAN_KUNIT_TEST`` can be built-in
+on any architecure that supports KASAN. These and any other KUnit
+tests enabled will run and print the results at boot as a late-init
+call.
+
+(3) Using kunit_tool
+~~~~~~~~~~~~~~~~~~~~~
+
+With ``CONFIG_KUNIT`` and ``CONFIG_KASAN_KUNIT_TEST`` built-in, we can also
+use kunit_tool to see the results of these along with other KUnit
+tests in a more readable way. This will not print the KASAN reports
+of tests that passed. Use `KUnit documentation <https://www.kernel.org/doc/html/latest/dev-tools/kunit/index.html>`_ for more up-to-date
+information on kunit_tool.
+
+.. _KUnit: https://www.kernel.org/doc/html/latest/dev-tools/kunit/index.html
+
+``CONFIG_TEST_KASAN_MODULE`` is a set of KASAN tests that could not be
+converted to KUnit. These tests can be run only as a module with
+``CONFIG_TEST_KASAN_MODULE`` built as a loadable module and
+``CONFIG_KASAN`` built-in. The type of error expected and the
+function being run is printed before the expression expected to give
+an error. Then the error is printed, if found, and that test
+should be interpretted to pass only if the error was the one expected
+by the test.
diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index a41a2d238af2..1c935f41cd3a 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -229,7 +229,7 @@ Testing with kmemleak-test
To check if you have all set up to use kmemleak, you can use the kmemleak-test
module, a module that deliberately leaks memory. Set CONFIG_DEBUG_KMEMLEAK_TEST
-as module (it can't be used as bult-in) and boot the kernel with kmemleak
+as module (it can't be used as built-in) and boot the kernel with kmemleak
enabled. Load the module and perform a scan with::
# modprobe kmemleak-test
diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst
index 58d513a0fa95..0d5dd5413af0 100644
--- a/Documentation/kbuild/makefiles.rst
+++ b/Documentation/kbuild/makefiles.rst
@@ -21,6 +21,7 @@ This document describes the Linux kernel Makefiles.
--- 3.10 Special Rules
--- 3.11 $(CC) support functions
--- 3.12 $(LD) support functions
+ --- 3.13 Script Invocation
=== 4 Host Program support
--- 4.1 Simple Host Program
@@ -605,6 +606,25 @@ more details, with real examples.
#Makefile
LDFLAGS_vmlinux += $(call ld-option, -X)
+3.13 Script invocation
+----------------------
+
+ Make rules may invoke scripts to build the kernel. The rules shall
+ always provide the appropriate interpreter to execute the script. They
+ shall not rely on the execute bits being set, and shall not invoke the
+ script directly. For the convenience of manual script invocation, such
+ as invoking ./scripts/checkpatch.pl, it is recommended to set execute
+ bits on the scripts nonetheless.
+
+ Kbuild provides variables $(CONFIG_SHELL), $(AWK), $(PERL),
+ $(PYTHON) and $(PYTHON3) to refer to interpreters for the respective
+ scripts.
+
+ Example::
+
+ #Makefile
+ cmd_depmod = $(CONFIG_SHELL) $(srctree)/scripts/depmod.sh $(DEPMOD) \
+ $(KERNELRELEASE)
4 Host Program support
======================
diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst
index c84471b180f8..6f8269c284ed 100644
--- a/Documentation/vm/active_mm.rst
+++ b/Documentation/vm/active_mm.rst
@@ -64,7 +64,7 @@ Active MM
actually get cases where you have a address space that is _only_ used by
lazy users. That is often a short-lived state, because once that thread
gets scheduled away in favour of a real thread, the "zombie" mm gets
- released because "mm_users" becomes zero.
+ released because "mm_count" becomes zero.
Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
more. "init_mm" should be considered just a "lazy context when no other
diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst
index 2b98efb5ba7f..324cefff92e7 100644
--- a/Documentation/x86/x86_64/boot-options.rst
+++ b/Documentation/x86/x86_64/boot-options.rst
@@ -173,6 +173,10 @@ NUMA
numa=noacpi
Don't parse the SRAT table for NUMA setup
+ numa=nohmat
+ Don't parse the HMAT table for NUMA setup, or soft-reserved memory
+ partitioning.
+
numa=fake=<size>[MG]
If given as a memory unit, fills all system RAM with nodes of
size interleaved over physical nodes.
diff --git a/MAINTAINERS b/MAINTAINERS
index 6336f9314ae1..7eecfb345806 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9727,8 +9727,8 @@ M: Catalin Marinas <catalin.marinas@arm.com>
S: Maintained
F: Documentation/dev-tools/kmemleak.rst
F: include/linux/kmemleak.h
-F: mm/kmemleak-test.c
F: mm/kmemleak.c
+F: samples/kmemleak/kmemleak-test.c
KMOD KERNEL MODULE LOADER - USERMODE HELPER
M: Luis Chamberlain <mcgrof@kernel.org>
diff --git a/Makefile b/Makefile
index 51540b291738..342d26432553 100644
--- a/Makefile
+++ b/Makefile
@@ -921,15 +921,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, maybe-uninitialized)
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
-# clang sets -fmerge-all-constants by default as optimization, but this
-# is non-conforming behavior for C and in fact breaks the kernel, so we
-# need to disable it here generally.
-KBUILD_CFLAGS += $(call cc-option,-fno-merge-all-constants)
-
-# for gcc -fno-merge-all-constants disables everything, but it is fine
-# to have actual conforming behavior enabled.
-KBUILD_CFLAGS += $(call cc-option,-fmerge-constants)
-
# Make sure -fstack-check isn't enabled (like gentoo apparently did)
KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 82d0b00bc7a5..3996b6572c3a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -85,7 +85,7 @@ config ARM
select HAVE_FAST_GUP if ARM_LPAE
select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG
- select HAVE_FUNCTION_TRACER if !XIP_KERNEL && (CC_IS_GCC || CLANG_VERSION >= 100000)
+ select HAVE_FUNCTION_TRACER if !XIP_KERNEL
select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)
select HAVE_IDE if PCI || ISA || PCMCIA
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 9415222b49ad..b8cbe03ad260 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -59,6 +59,7 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
#ifdef CONFIG_ARM_LPAE
struct page *page = virt_to_page(pmdp);
+ pgtable_pmd_page_dtor(page);
tlb_remove_table(tlb, page);
#endif
}
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index d8e18cdd96d3..3f65d0ac9f63 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -843,20 +843,26 @@ early_param("mem", early_mem);
static void __init request_standard_resources(const struct machine_desc *mdesc)
{
- struct memblock_region *region;
+ phys_addr_t start, end, res_end;
struct resource *res;
+ u64 i;
kernel_code.start = virt_to_phys(_text);
kernel_code.end = virt_to_phys(__init_begin - 1);
kernel_data.start = virt_to_phys(_sdata);
kernel_data.end = virt_to_phys(_end - 1);
- for_each_memblock(memory, region) {
- phys_addr_t start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
- phys_addr_t end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
+ for_each_mem_range(i, &start, &end) {
unsigned long boot_alias_start;
/*
+ * In memblock, end points to the first byte after the
+ * range while in resourses, end points to the last byte in
+ * the range.
+ */
+ res_end = end - 1;
+
+ /*
* Some systems have a special memory alias which is only
* used for booting. We need to advertise this region to
* kexec-tools so they know where bootable RAM is located.
@@ -869,7 +875,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
__func__, sizeof(*res));
res->name = "System RAM (boot alias)";
res->start = boot_alias_start;
- res->end = phys_to_idmap(end);
+ res->end = phys_to_idmap(res_end);
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
request_resource(&iomem_resource, res);
}
@@ -880,7 +886,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
sizeof(*res));
res->name = "System RAM";
res->start = start;
- res->end = end;
+ res->end = res_end;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
request_resource(&iomem_resource, res);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 000c1b48e973..45f9d5ec2360 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -299,16 +299,14 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
*/
static void __init free_unused_memmap(void)
{
- unsigned long start, prev_end = 0;
- struct memblock_region *reg;
+ unsigned long start, end, prev_end = 0;
+ int i;
/*
* This relies on each bank being in address order.
* The banks are sorted previously in bootmem_init().
*/
- for_each_memblock(memory, reg) {
- start = memblock_region_memory_base_pfn(reg);
-
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
#ifdef CONFIG_SPARSEMEM
/*
* Take care not to free memmap entries that don't exist
@@ -336,8 +334,7 @@ static void __init free_unused_memmap(void)
* memmap entries are valid from the bank end aligned to
* MAX_ORDER_NR_PAGES.
*/
- prev_end = ALIGN(memblock_region_memory_end_pfn(reg),
- MAX_ORDER_NR_PAGES);
+ prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
}
#ifdef CONFIG_SPARSEMEM
@@ -347,61 +344,29 @@ static void __init free_unused_memmap(void)
#endif
}
-#ifdef CONFIG_HIGHMEM
-static inline void free_area_high(unsigned long pfn, unsigned long end)
-{
- for (; pfn < end; pfn++)
- free_highmem_page(pfn_to_page(pfn));
-}
-#endif
-
static void __init free_highpages(void)
{
#ifdef CONFIG_HIGHMEM
unsigned long max_low = max_low_pfn;
- struct memblock_region *mem, *res;
+ phys_addr_t range_start, range_end;
+ u64 i;
/* set highmem page free */
- for_each_memblock(memory, mem) {
- unsigned long start = memblock_region_memory_base_pfn(mem);
- unsigned long end = memblock_region_memory_end_pfn(mem);
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+ &range_start, &range_end, NULL) {
+ unsigned long start = PHYS_PFN(range_start);
+ unsigned long end = PHYS_PFN(range_end);
/* Ignore complete lowmem entries */
if (end <= max_low)
continue;
- if (memblock_is_nomap(mem))
- continue;
-
/* Truncate partial highmem entries */
if (start < max_low)
start = max_low;
- /* Find and exclude any reserved regions */
- for_each_memblock(reserved, res) {
- unsigned long res_start, res_end;
-
- res_start = memblock_region_reserved_base_pfn(res);
- res_end = memblock_region_reserved_end_pfn(res);
-
- if (res_end < start)
- continue;
- if (res_start < start)
- res_start = start;
- if (res_start > end)
- res_start = end;
- if (res_end > end)
- res_end = end;
- if (res_start != start)
- free_area_high(start, res_start);
- start = res_end;
- if (start == end)
- break;
- }
-
- /* And now free anything which remains */
- if (start < end)
- free_area_high(start, end);
+ for (; start < end; start++)
+ free_highmem_page(pfn_to_page(start));
}
#endif
}
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index c36f977b2ccb..698cc740c6b8 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1154,9 +1154,8 @@ phys_addr_t arm_lowmem_limit __initdata = 0;
void __init adjust_lowmem_bounds(void)
{
- phys_addr_t memblock_limit = 0;
- u64 vmalloc_limit;
- struct memblock_region *reg;
+ phys_addr_t block_start, block_end, memblock_limit = 0;
+ u64 vmalloc_limit, i;
phys_addr_t lowmem_limit = 0;
/*
@@ -1172,26 +1171,18 @@ void __init adjust_lowmem_bounds(void)
* The first usable region must be PMD aligned. Mark its start
* as MEMBLOCK_NOMAP if it isn't
*/
- for_each_memblock(memory, reg) {
- if (!memblock_is_nomap(reg)) {
- if (!IS_ALIGNED(reg->base, PMD_SIZE)) {
- phys_addr_t len;
+ for_each_mem_range(i, &block_start, &block_end) {
+ if (!IS_ALIGNED(block_start, PMD_SIZE)) {
+ phys_addr_t len;
- len = round_up(reg->base, PMD_SIZE) - reg->base;
- memblock_mark_nomap(reg->base, len);
- }
- break;
+ len = round_up(block_start, PMD_SIZE) - block_start;
+ memblock_mark_nomap(block_start, len);
}
+ break;
}
- for_each_memblock(memory, reg) {
- phys_addr_t block_start = reg->base;
- phys_addr_t block_end = reg->base + reg->size;
-
- if (memblock_is_nomap(reg))
- continue;
-
- if (reg->base < vmalloc_limit) {
+ for_each_mem_range(i, &block_start, &block_end) {
+ if (block_start < vmalloc_limit) {
if (block_end > lowmem_limit)
/*
* Compare as u64 to ensure vmalloc_limit does
@@ -1440,19 +1431,15 @@ static void __init kmap_init(void)
static void __init map_lowmem(void)
{
- struct memblock_region *reg;
phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE);
phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
+ phys_addr_t start, end;
+ u64 i;
/* Map all the lowmem memory banks. */
- for_each_memblock(memory, reg) {
- phys_addr_t start = reg->base;
- phys_addr_t end = start + reg->size;
+ for_each_mem_range(i, &start, &end) {
struct map_desc map;
- if (memblock_is_nomap(reg))
- continue;
-
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
if (start >= end)
diff --git a/arch/arm/mm/pmsa-v7.c b/arch/arm/mm/pmsa-v7.c
index 699fa2e88725..88950e41a3a9 100644
--- a/arch/arm/mm/pmsa-v7.c
+++ b/arch/arm/mm/pmsa-v7.c
@@ -231,12 +231,12 @@ static int __init allocate_region(phys_addr_t base, phys_addr_t size,
void __init pmsav7_adjust_lowmem_bounds(void)
{
phys_addr_t specified_mem_size = 0, total_mem_size = 0;
- struct memblock_region *reg;
- bool first = true;
phys_addr_t mem_start;
phys_addr_t mem_end;
+ phys_addr_t reg_start, reg_end;
unsigned int mem_max_regions;
- int num, i;
+ int num;
+ u64 i;
/* Free-up PMSAv7_PROBE_REGION */
mpu_min_region_order = __mpu_min_region_order();
@@ -262,20 +262,19 @@ void __init pmsav7_adjust_lowmem_bounds(void)
mem_max_regions -= num;
#endif
- for_each_memblock(memory, reg) {
- if (first) {
+ for_each_mem_range(i, &reg_start, &reg_end) {
+ if (i == 0) {
phys_addr_t phys_offset = PHYS_OFFSET;
/*
* Initially only use memory continuous from
* PHYS_OFFSET */
- if (reg->base != phys_offset)
+ if (reg_start != phys_offset)
panic("First memory bank must be contiguous from PHYS_OFFSET");
- mem_start = reg->base;
- mem_end = reg->base + reg->size;
- specified_mem_size = reg->size;
- first = false;
+ mem_start = reg_start;
+ mem_end = reg_end;
+ specified_mem_size = mem_end - mem_start;
} else {
/*
* memblock auto merges contiguous blocks, remove
@@ -283,8 +282,8 @@ void __init pmsav7_adjust_lowmem_bounds(void)
* blocks separately while iterating)
*/
pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n",
- &mem_end, &reg->base);
- memblock_remove(reg->base, 0 - reg->base);
+ &mem_end, &reg_start);
+ memblock_remove(reg_start, 0 - reg_start);
break;
}
}
diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c
index 0d7d5fb59247..2de019f7503e 100644
--- a/arch/arm/mm/pmsa-v8.c
+++ b/arch/arm/mm/pmsa-v8.c
@@ -94,20 +94,19 @@ static __init bool is_region_fixed(int number)
void __init pmsav8_adjust_lowmem_bounds(void)
{
phys_addr_t mem_end;
- struct memblock_region *reg;
- bool first = true;
+ phys_addr_t reg_start, reg_end;
+ u64 i;
- for_each_memblock(memory, reg) {
- if (first) {
+ for_each_mem_range(i, &reg_start, &reg_end) {
+ if (i == 0) {
phys_addr_t phys_offset = PHYS_OFFSET;
/*
* Initially only use memory continuous from
* PHYS_OFFSET */
- if (reg->base != phys_offset)
+ if (reg_start != phys_offset)
panic("First memory bank must be contiguous from PHYS_OFFSET");
- mem_end = reg->base + reg->size;
- first = false;
+ mem_end = reg_end;
} else {
/*
* memblock auto merges contiguous blocks, remove
@@ -115,8 +114,8 @@ void __init pmsav8_adjust_lowmem_bounds(void)
* blocks separately while iterating)
*/
pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n",
- &mem_end, &reg->base);
- memblock_remove(reg->base, 0 - reg->base);
+ &mem_end, &reg_start);
+ memblock_remove(reg_start, 0 - reg_start);
break;
}
}
diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index 396797ffe2b1..d3ef975a0965 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -25,11 +25,12 @@
unsigned long xen_get_swiotlb_free_pages(unsigned int order)
{
- struct memblock_region *reg;
+ phys_addr_t base;
gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
+ u64 i;
- for_each_memblock(memory, reg) {
- if (reg->base < (phys_addr_t)0xffffffff) {
+ for_each_mem_range(i, &base, NULL) {
+ if (base < (phys_addr_t)0xffffffff) {
if (IS_ENABLED(CONFIG_ZONE_DMA32))
flags |= __GFP_DMA32;
else
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9cd317f00034..893130ce1626 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1599,8 +1599,6 @@ config ARM64_BTI_KERNEL
depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697
depends on !CC_IS_GCC || GCC_VERSION >= 100100
- # https://reviews.llvm.org/rGb8ae3fdfa579dbf366b1bb1cbfdbf8c51db7fa55
- depends on !CC_IS_CLANG || CLANG_VERSION >= 100001
depends on !(CC_IS_CLANG && GCOV_KERNEL)
depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
help
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 361a1143e09e..5b0e67b93cdc 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -215,8 +215,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
phys_addr_t start, end;
nr_ranges = 1; /* for exclusion of crashkernel region */
- for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL)
+ for_each_mem_range(i, &start, &end)
nr_ranges++;
cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL);
@@ -225,8 +224,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
cmem->max_nr_ranges = nr_ranges;
cmem->nr_ranges = 0;
- for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range(i, &start, &end) {
cmem->ranges[cmem->nr_ranges].start = start;
cmem->ranges[cmem->nr_ranges].end = end - 1;
cmem->nr_ranges++;
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 53acbeca4f57..133257ffd859 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -217,7 +217,7 @@ static void __init request_standard_resources(void)
if (!standard_resources)
panic("%s: Failed to allocate %zu bytes\n", __func__, res_size);
- for_each_memblock(memory, region) {
+ for_each_mem_region(region) {
res = &standard_resources[i++];
if (memblock_is_nomap(region)) {
res->name = "reserved";
@@ -257,7 +257,7 @@ static int __init reserve_memblock_reserved_regions(void)
if (!memblock_is_region_reserved(mem->start, mem_size))
continue;
- for_each_reserved_mem_region(j, &r_start, &r_end) {
+ for_each_reserved_mem_range(j, &r_start, &r_end) {
resource_size_t start, end;
start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start);
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 45d5cfe46429..04021a93171c 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -43,13 +43,6 @@ ifneq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
endif
-# Clang versions less than 8 do not support -mcmodel=tiny
-ifeq ($(CONFIG_CC_IS_CLANG), y)
- ifeq ($(shell test $(CONFIG_CLANG_VERSION) -lt 80000; echo $$?),0)
- CFLAGS_REMOVE_vgettimeofday.o += -mcmodel=tiny
- endif
-endif
-
# Disable gcov profiling for VDSO code
GCOV_PROFILE := n
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 481d22c32a2e..f0bf86d81622 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -471,12 +471,10 @@ static inline void free_memmap(unsigned long start_pfn, unsigned long end_pfn)
*/
static void __init free_unused_memmap(void)
{
- unsigned long start, prev_end = 0;
- struct memblock_region *reg;
-
- for_each_memblock(memory, reg) {
- start = __phys_to_pfn(reg->base);
+ unsigned long start, end, prev_end = 0;
+ int i;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
#ifdef CONFIG_SPARSEMEM
/*
* Take care not to free memmap entries that don't exist due
@@ -496,8 +494,7 @@ static void __init free_unused_memmap(void)
* memmap entries are valid from the bank end aligned to
* MAX_ORDER_NR_PAGES.
*/
- prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
- MAX_ORDER_NR_PAGES);
+ prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
}
#ifdef CONFIG_SPARSEMEM
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 7291b26ce788..b24e43d20667 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -212,8 +212,8 @@ void __init kasan_init(void)
{
u64 kimg_shadow_start, kimg_shadow_end;
u64 mod_shadow_start, mod_shadow_end;
- struct memblock_region *reg;
- int i;
+ phys_addr_t pa_start, pa_end;
+ u64 i;
kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end));
@@ -246,9 +246,9 @@ void __init kasan_init(void)
kasan_populate_early_shadow((void *)mod_shadow_end,
(void *)kimg_shadow_start);
- for_each_memblock(memory, reg) {
- void *start = (void *)__phys_to_virt(reg->base);
- void *end = (void *)__phys_to_virt(reg->base + reg->size);
+ for_each_mem_range(i, &pa_start, &pa_end) {
+ void *start = (void *)__phys_to_virt(pa_start);
+ void *end = (void *)__phys_to_virt(pa_end);
if (start >= end)
break;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 087a844b4d26..beff3ad8c7f8 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -473,8 +473,9 @@ static void __init map_mem(pgd_t *pgdp)
{
phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
- struct memblock_region *reg;
+ phys_addr_t start, end;
int flags = 0;
+ u64 i;
if (rodata_full || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -493,15 +494,9 @@ static void __init map_mem(pgd_t *pgdp)
#endif
/* map all the memory banks */
- for_each_memblock(memory, reg) {
- phys_addr_t start = reg->base;
- phys_addr_t end = start + reg->size;
-
+ for_each_mem_range(i, &start, &end) {
if (start >= end)
break;
- if (memblock_is_nomap(reg))
- continue;
-
/*
* The linear map must allow allocation tags reading/writing
* if MTE is present. Otherwise, it has the same attributes as
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 676deb220b99..a8303bc6b62a 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -354,7 +354,7 @@ static int __init numa_register_nodes(void)
struct memblock_region *mblk;
/* Check that valid nid is set to memblks */
- for_each_memblock(memory, mblk) {
+ for_each_mem_region(mblk) {
int mblk_nid = memblock_get_region_node(mblk);
if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
@@ -427,19 +427,16 @@ out_free_distance:
*/
static int __init dummy_numa_init(void)
{
+ phys_addr_t start = memblock_start_of_DRAM();
+ phys_addr_t end = memblock_end_of_DRAM();
int ret;
- struct memblock_region *mblk;
if (numa_off)
pr_info("NUMA disabled\n"); /* Forced off on command line. */
- pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n",
- memblock_start_of_DRAM(), memblock_end_of_DRAM() - 1);
-
- for_each_memblock(memory, mblk) {
- ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size);
- if (!ret)
- continue;
+ pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1);
+ ret = numa_add_memblk(0, start, end);
+ if (ret) {
pr_err("NUMA init failed\n");
return ret;
}
diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c
index 8ef35131f999..9254c3b794a5 100644
--- a/arch/c6x/kernel/setup.c
+++ b/arch/c6x/kernel/setup.c
@@ -287,7 +287,8 @@ notrace void __init machine_init(unsigned long dt_ptr)
void __init setup_arch(char **cmdline_p)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
printk(KERN_INFO "Initializing kernel\n");
@@ -351,9 +352,9 @@ void __init setup_arch(char **cmdline_p)
disable_caching(ram_start, ram_end - 1);
/* Set caching of external RAM used by Linux */
- for_each_memblock(memory, reg)
- enable_caching(CACHE_REGION_START(reg->base),
- CACHE_REGION_START(reg->base + reg->size - 1));
+ for_each_mem_range(i, &start, &end)
+ enable_caching(CACHE_REGION_START(start),
+ CACHE_REGION_START(end - 1));
#ifdef CONFIG_BLK_DEV_INITRD
/*
diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c
index 28ac88358a89..0281f92eea3d 100644
--- a/arch/h8300/kernel/setup.c
+++ b/arch/h8300/kernel/setup.c
@@ -74,17 +74,15 @@ static void __init bootmem_init(void)
memory_end = memory_start = 0;
/* Find main memory where is the kernel */
- for_each_memblock(memory, region) {
- memory_start = region->base;
- memory_end = region->base + region->size;
- }
+ memory_start = memblock_start_of_DRAM();
+ memory_end = memblock_end_of_DRAM();
if (!memory_end)
panic("No memory!");
/* setup bootmem globals (we use no_bootmem, but mm still depends on this) */
min_low_pfn = PFN_UP(memory_start);
- max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
+ max_low_pfn = PFN_DOWN(memory_end);
max_pfn = max_low_pfn;
memblock_reserve(__pa(_stext), _end - _stext);
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 3344d4a1fe89..0902c459c385 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -108,15 +108,15 @@ static void __init paging_init(void)
void __init setup_memory(void)
{
- struct memblock_region *reg;
-
#ifndef CONFIG_MMU
u32 kernel_align_start, kernel_align_size;
+ phys_addr_t start, end;
+ u64 i;
/* Find main memory where is the kernel */
- for_each_memblock(memory, reg) {
- memory_start = (u32)reg->base;
- lowmem_size = reg->size;
+ for_each_mem_range(i, &start, &end) {
+ memory_start = start;
+ lowmem_size = end - start;
if ((memory_start <= (u32)_text) &&
((u32)_text <= (memory_start + lowmem_size - 1))) {
memory_size = lowmem_size;
@@ -164,17 +164,6 @@ void __init setup_memory(void)
pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn);
pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn);
- /* Add active regions with valid PFNs */
- for_each_memblock(memory, reg) {
- unsigned long start_pfn, end_pfn;
-
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
- memblock_set_node(start_pfn << PAGE_SHIFT,
- (end_pfn - start_pfn) << PAGE_SHIFT,
- &memblock.memory, 0);
- }
-
paging_init();
}
diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
index 14ea680d180e..ad1aecc4b401 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -190,25 +190,25 @@ char *octeon_swiotlb;
void __init plat_swiotlb_setup(void)
{
- struct memblock_region *mem;
+ phys_addr_t start, end;
phys_addr_t max_addr;
phys_addr_t addr_size;
size_t swiotlbsize;
unsigned long swiotlb_nslabs;
+ u64 i;
max_addr = 0;
addr_size = 0;
- for_each_memblock(memory, mem) {
+ for_each_mem_range(i, &start, &end) {
/* These addresses map low for PCI. */
- if (mem->base > 0x410000000ull && !OCTEON_IS_OCTEON2())
+ if (start > 0x410000000ull && !OCTEON_IS_OCTEON2())
continue;
- addr_size += mem->size;
-
- if (max_addr < mem->base + mem->size)
- max_addr = mem->base + mem->size;
+ addr_size += (end - start);
+ if (max_addr < end)
+ max_addr = end;
}
swiotlbsize = PAGE_SIZE;
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index bf5f5acab0a8..335bd188b8b4 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -300,8 +300,9 @@ static void __init bootmem_init(void)
static void __init bootmem_init(void)
{
- struct memblock_region *mem;
phys_addr_t ramstart, ramend;
+ phys_addr_t start, end;
+ u64 i;
ramstart = memblock_start_of_DRAM();
ramend = memblock_end_of_DRAM();
@@ -338,18 +339,13 @@ static void __init bootmem_init(void)
min_low_pfn = ARCH_PFN_OFFSET;
max_pfn = PFN_DOWN(ramend);
- for_each_memblock(memory, mem) {
- unsigned long start = memblock_region_memory_base_pfn(mem);
- unsigned long end = memblock_region_memory_end_pfn(mem);
-
+ for_each_mem_range(i, &start, &end) {
/*
* Skip highmem here so we get an accurate max_low_pfn if low
* memory stops short of high memory.
* If the region overlaps HIGHMEM_START, end is clipped so
* max_pfn excludes the highmem portion.
*/
- if (memblock_is_nomap(mem))
- continue;
if (start >= PFN_DOWN(HIGHMEM_START))
continue;
if (end > PFN_DOWN(HIGHMEM_START))
@@ -450,13 +446,12 @@ early_param("memmap", early_parse_memmap);
unsigned long setup_elfcorehdr, setup_elfcorehdr_size;
static int __init early_parse_elfcorehdr(char *p)
{
- struct memblock_region *mem;
+ phys_addr_t start, end;
+ u64 i;
setup_elfcorehdr = memparse(p, &p);
- for_each_memblock(memory, mem) {
- unsigned long start = mem->base;
- unsigned long end = start + mem->size;
+ for_each_mem_range(i, &start, &end) {
if (setup_elfcorehdr >= start && setup_elfcorehdr < end) {
/*
* Reserve from the elf core header to the end of
@@ -720,7 +715,8 @@ static void __init arch_mem_init(char **cmdline_p)
static void __init resource_init(void)
{
- struct memblock_region *region;
+ phys_addr_t start, end;
+ u64 i;
if (UNCAC_BASE != IO_BASE)
return;
@@ -732,9 +728,7 @@ static void __init resource_init(void)
bss_resource.start = __pa_symbol(&__bss_start);
bss_resource.end = __pa_symbol(&__bss_stop) - 1;
- for_each_memblock(memory, region) {
- phys_addr_t start = PFN_PHYS(memblock_region_memory_base_pfn(region));
- phys_addr_t end = PFN_PHYS(memblock_region_memory_end_pfn(region)) - 1;
+ for_each_mem_range(i, &start, &end) {
struct resource *res;
res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
@@ -743,7 +737,12 @@ static void __init resource_init(void)
sizeof(struct resource));
res->start = start;
- res->end = end;
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in resourses, end points to the last byte in
+ * the range.
+ */
+ res->end = end - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
res->name = "System RAM";
diff --git a/arch/mips/netlogic/xlp/setup.c b/arch/mips/netlogic/xlp/setup.c
index 1a0fc5b62ba4..6e3102bcd2f1 100644
--- a/arch/mips/netlogic/xlp/setup.c
+++ b/arch/mips/netlogic/xlp/setup.c
@@ -70,7 +70,7 @@ static void nlm_fixup_mem(void)
const int pref_backup = 512;
struct memblock_region *mem;
- for_each_memblock(memory, mem) {
+ for_each_mem_region(mem) {
memblock_remove(mem->base + mem->size - pref_backup,
pref_backup);
}
diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c
index a066efbe53c0..c356e484dcab 100644
--- a/arch/nds32/kernel/setup.c
+++ b/arch/nds32/kernel/setup.c
@@ -249,12 +249,8 @@ static void __init setup_memory(void)
memory_end = memory_start = 0;
/* Find main memory where is the kernel */
- for_each_memblock(memory, region) {
- memory_start = region->base;
- memory_end = region->base + region->size;
- pr_info("%s: Memory: 0x%x-0x%x\n", __func__,
- memory_start, memory_end);
- }
+ memory_start = memblock_start_of_DRAM();
+ memory_end = memblock_end_of_DRAM();
if (!memory_end) {
panic("No memory!");
diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c
index 13c87f1f872b..2416a9f91533 100644
--- a/arch/openrisc/kernel/setup.c
+++ b/arch/openrisc/kernel/setup.c
@@ -48,17 +48,12 @@ static void __init setup_memory(void)
unsigned long ram_start_pfn;
unsigned long ram_end_pfn;
phys_addr_t memory_start, memory_end;
- struct memblock_region *region;
memory_end = memory_start = 0;
/* Find main memory where is the kernel, we assume its the only one */
- for_each_memblock(memory, region) {
- memory_start = region->base;
- memory_end = region->base + region->size;
- printk(KERN_INFO "%s: Memory: 0x%x-0x%x\n", __func__,
- memory_start, memory_end);
- }
+ memory_start = memblock_start_of_DRAM();
+ memory_end = memblock_end_of_DRAM();
if (!memory_end) {
panic("No memory!");
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index 3d7c79c7745d..8348feaaf46e 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -64,6 +64,7 @@ extern const char _s_kernel_ro[], _e_kernel_ro[];
*/
static void __init map_ram(void)
{
+ phys_addr_t start, end;
unsigned long v, p, e;
pgprot_t prot;
pgd_t *pge;
@@ -71,6 +72,7 @@ static void __init map_ram(void)
pud_t *pue;
pmd_t *pme;
pte_t *pte;
+ u64 i;
/* These mark extents of read-only kernel pages...
* ...from vmlinux.lds.S
*/
@@ -78,9 +80,9 @@ static void __init map_ram(void)
v = PAGE_OFFSET;
- for_each_memblock(memory, region) {
- p = (u32) region->base & PAGE_MASK;
- e = p + (u32) region->size;
+ for_each_mem_range(i, &start, &end) {
+ p = (u32) start & PAGE_MASK;
+ e = (u32) end;
v = (u32) __va(p);
pge = pgd_offset_k(v);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 10ebb4bf71ad..5cdf4168a61a 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -191,13 +191,13 @@ int is_fadump_active(void)
*/
static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
{
- struct memblock_region *reg;
+ phys_addr_t reg_start, reg_end;
bool ret = false;
- u64 start, end;
+ u64 i, start, end;
- for_each_memblock(memory, reg) {
- start = max_t(u64, d_start, reg->base);
- end = min_t(u64, d_end, (reg->base + reg->size));
+ for_each_mem_range(i, &reg_start, &reg_end) {
+ start = max_t(u64, d_start, reg_start);
+ end = min_t(u64, d_end, reg_end);
if (d_start < end) {
/* Memory hole from d_start to start */
if (start > d_start)
@@ -422,34 +422,34 @@ static int __init add_boot_mem_regions(unsigned long mstart,
static int __init fadump_get_boot_mem_regions(void)
{
- unsigned long base, size, cur_size, hole_size, last_end;
+ unsigned long size, cur_size, hole_size, last_end;
unsigned long mem_size = fw_dump.boot_memory_size;
- struct memblock_region *reg;
+ phys_addr_t reg_start, reg_end;
int ret = 1;
+ u64 i;
fw_dump.boot_mem_regs_cnt = 0;
last_end = 0;
hole_size = 0;
cur_size = 0;
- for_each_memblock(memory, reg) {
- base = reg->base;
- size = reg->size;
- hole_size += (base - last_end);
+ for_each_mem_range(i, &reg_start, &reg_end) {
+ size = reg_end - reg_start;
+ hole_size += (reg_start - last_end);
if ((cur_size + size) >= mem_size) {
size = (mem_size - cur_size);
- ret = add_boot_mem_regions(base, size);
+ ret = add_boot_mem_regions(reg_start, size);
break;
}
mem_size -= size;
cur_size += size;
- ret = add_boot_mem_regions(base, size);
+ ret = add_boot_mem_regions(reg_start, size);
if (!ret)
break;
- last_end = base + size;
+ last_end = reg_end;
}
fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size);
@@ -985,9 +985,8 @@ static int fadump_init_elfcore_header(char *bufp)
*/
static int fadump_setup_crash_memory_ranges(void)
{
- struct memblock_region *reg;
- u64 start, end;
- int i, ret;
+ u64 i, start, end;
+ int ret;
pr_debug("Setup crash memory ranges.\n");
crash_mrange_info.mem_range_cnt = 0;
@@ -1005,10 +1004,7 @@ static int fadump_setup_crash_memory_ranges(void)
return ret;
}
- for_each_memblock(memory, reg) {
- start = (u64)reg->base;
- end = start + (u64)reg->size;
-
+ for_each_mem_range(i, &start, &end) {
/*
* skip the memory chunk that is already added
* (0 through boot_memory_top).
@@ -1242,14 +1238,17 @@ static void fadump_free_reserved_memory(unsigned long start_pfn,
*/
static void fadump_release_reserved_area(u64 start, u64 end)
{
+ unsigned long reg_spfn, reg_epfn;
u64 tstart, tend, spfn, epfn;
- struct memblock_region *reg;
+ int i;
spfn = PHYS_PFN(start);
epfn = PHYS_PFN(end);
- for_each_memblock(memory, reg) {
- tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg));
- tend = min_t(u64, epfn, memblock_region_memory_end_pfn(reg));
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &reg_spfn, &reg_epfn, NULL) {
+ tstart = max_t(u64, spfn, reg_spfn);
+ tend = min_t(u64, epfn, reg_epfn);
+
if (tstart < tend) {
fadump_free_reserved_memory(tstart, tend);
@@ -1684,12 +1683,10 @@ int __init fadump_reserve_mem(void)
/* Preserve everything above the base address */
static void __init fadump_reserve_crash_area(u64 base)
{
- struct memblock_region *reg;
- u64 mstart, msize;
+ u64 i, mstart, mend, msize;
- for_each_memblock(memory, reg) {
- mstart = reg->base;
- msize = reg->size;
+ for_each_mem_range(i, &mstart, &mend) {
+ msize = mend - mstart;
if ((mstart + msize) < base)
continue;
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 53bb71e3a2e1..c69bcf9b547a 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -138,15 +138,13 @@ out:
*/
static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
struct crash_mem *tmem;
+ u64 i;
int ret;
- for_each_memblock(memory, reg) {
- u64 base, size;
-
- base = (u64)reg->base;
- size = (u64)reg->size;
+ for_each_mem_range(i, &base, &end) {
+ u64 size = end - base;
/* Skip backup memory region, which needs a separate entry */
if (base == BACKUP_SRC_START) {
@@ -250,8 +248,7 @@ static int __locate_mem_hole_top_down(struct kexec_buf *kbuf,
phys_addr_t start, end;
u64 i;
- for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range_rev(i, &start, &end) {
/*
* memblock uses [start, end) convention while it is
* [start, end] here. Fix the off-by-one to have the
@@ -350,8 +347,7 @@ static int __locate_mem_hole_bottom_up(struct kexec_buf *kbuf,
phys_addr_t start, end;
u64 i;
- for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range(i, &start, &end) {
/*
* memblock uses [start, end) convention while it is
* [start, end] here. Fix the off-by-one to have the
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 073617ce83e0..8f58dd20b362 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -95,23 +95,15 @@ EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
void __init kvm_cma_reserve(void)
{
unsigned long align_size;
- struct memblock_region *reg;
- phys_addr_t selected_size = 0;
+ phys_addr_t selected_size;
/*
* We need CMA reservation only when we are in HV mode
*/
if (!cpu_has_feature(CPU_FTR_HVMODE))
return;
- /*
- * We cannot use memblock_phys_mem_size() here, because
- * memblock_analyze() has not been called yet.
- */
- for_each_memblock(memory, reg)
- selected_size += memblock_region_memory_end_pfn(reg) -
- memblock_region_memory_base_pfn(reg);
- selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
+ selected_size = PAGE_ALIGN(memblock_phys_mem_size() * kvm_cma_resv_ratio / 100);
if (selected_size) {
pr_info("%s: reserving %ld MiB for global area\n", __func__,
(unsigned long)selected_size / SZ_1M);
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 7705d5557239..84e5a2dc8be5 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -687,9 +687,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
struct kvmppc_uvmem_page_pvt *pvt;
unsigned long pfn_last, pfn_first;
- pfn_first = kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT;
+ pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT;
pfn_last = pfn_first +
- (resource_size(&kvmppc_uvmem_pgmap.res) >> PAGE_SHIFT);
+ (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT);
spin_lock(&kvmppc_uvmem_bitmap_lock);
bit = find_first_zero_bit(kvmppc_uvmem_bitmap,
@@ -1007,7 +1007,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
static void kvmppc_uvmem_page_free(struct page *page)
{
unsigned long pfn = page_to_pfn(page) -
- (kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT);
+ (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT);
struct kvmppc_uvmem_page_pvt *pvt;
spin_lock(&kvmppc_uvmem_bitmap_lock);
@@ -1170,7 +1170,9 @@ int kvmppc_uvmem_init(void)
}
kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
- kvmppc_uvmem_pgmap.res = *res;
+ kvmppc_uvmem_pgmap.range.start = res->start;
+ kvmppc_uvmem_pgmap.range.end = res->end;
+ kvmppc_uvmem_pgmap.nr_range = 1;
kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
/* just one global instance: */
kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
@@ -1205,7 +1207,7 @@ void kvmppc_uvmem_free(void)
return;
memunmap_pages(&kvmppc_uvmem_pgmap);
- release_mem_region(kvmppc_uvmem_pgmap.res.start,
- resource_size(&kvmppc_uvmem_pgmap.res));
+ release_mem_region(kvmppc_uvmem_pgmap.range.start,
+ range_len(&kvmppc_uvmem_pgmap.range));
kfree(kvmppc_uvmem_bitmap);
}
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index c663e7ba801f..b830adee51f5 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -7,7 +7,7 @@
*
* SMP scalability work:
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
+ *
* Module name: htab.c
*
* Description:
@@ -867,8 +867,8 @@ static void __init htab_initialize(void)
unsigned long table;
unsigned long pteg_count;
unsigned long prot;
- unsigned long base = 0, size = 0;
- struct memblock_region *reg;
+ phys_addr_t base = 0, size = 0, end;
+ u64 i;
DBG(" -> htab_initialize()\n");
@@ -884,7 +884,7 @@ static void __init htab_initialize(void)
/*
* Calculate the required size of the htab. We want the number of
* PTEGs to equal one half the number of real pages.
- */
+ */
htab_size_bytes = htab_get_table_size();
pteg_count = htab_size_bytes >> 7;
@@ -894,7 +894,7 @@ static void __init htab_initialize(void)
firmware_has_feature(FW_FEATURE_PS3_LV1)) {
/* Using a hypervisor which owns the htab */
htab_address = NULL;
- _SDR1 = 0;
+ _SDR1 = 0;
#ifdef CONFIG_FA_DUMP
/*
* If firmware assisted dump is active firmware preserves
@@ -960,9 +960,9 @@ static void __init htab_initialize(void)
#endif /* CONFIG_DEBUG_PAGEALLOC */
/* create bolted the linear mapping in the hash table */
- for_each_memblock(memory, reg) {
- base = (unsigned long)__va(reg->base);
- size = reg->size;
+ for_each_mem_range(i, &base, &end) {
+ size = end - base;
+ base = (unsigned long)__va(base);
DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
base, size, prot);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index d5f0c10d752a..cc72666e891a 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -329,7 +329,8 @@ static int __meminit create_physical_mapping(unsigned long start,
static void __init radix_init_pgtable(void)
{
unsigned long rts_field;
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
/* We don't support slb for radix */
mmu_slb_size = 0;
@@ -337,20 +338,19 @@ static void __init radix_init_pgtable(void)
/*
* Create the linear mapping
*/
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
/*
* The memblock allocator is up at this point, so the
* page tables will be allocated within the range. No
* need or a node (which we don't have yet).
*/
- if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
+ if (end >= RADIX_VMALLOC_START) {
pr_warn("Outside the supported range\n");
continue;
}
- WARN_ON(create_physical_mapping(reg->base,
- reg->base + reg->size,
+ WARN_ON(create_physical_mapping(start, end,
radix_mem_block_size,
-1, PAGE_KERNEL));
}
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index fb294046e00e..26fda3203320 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -138,11 +138,11 @@ void __init kasan_mmu_init(void)
void __init kasan_init(void)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
+ u64 i;
- for_each_memblock(memory, reg) {
- phys_addr_t base = reg->base;
- phys_addr_t top = min(base + reg->size, total_lowmem);
+ for_each_mem_range(i, &base, &end) {
+ phys_addr_t top = min(end, total_lowmem);
int ret;
if (base >= top)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 42e25874f5a8..5e2e7c0a8f1a 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -184,15 +184,16 @@ void __init initmem_init(void)
/* mark pages that don't exist as nosave */
static int __init mark_nonram_nosave(void)
{
- struct memblock_region *reg, *prev = NULL;
-
- for_each_memblock(memory, reg) {
- if (prev &&
- memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg))
- register_nosave_region(memblock_region_memory_end_pfn(prev),
- memblock_region_memory_base_pfn(reg));
- prev = reg;
+ unsigned long spfn, epfn, prev = 0;
+ int i;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &spfn, &epfn, NULL) {
+ if (prev && prev < spfn)
+ register_nosave_region(prev, spfn);
+
+ prev = epfn;
}
+
return 0;
}
#else /* CONFIG_NEED_MULTIPLE_NODES */
@@ -584,20 +585,24 @@ void flush_icache_user_page(struct vm_area_struct *vma, struct page *page,
*/
static int __init add_system_ram_resources(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
struct resource *res;
- unsigned long base = reg->base;
- unsigned long size = reg->size;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
WARN_ON(!res);
if (res) {
res->name = "System RAM";
- res->start = base;
- res->end = base + size - 1;
+ res->start = start;
+ /*
+ * In memblock, end points to the first byte after
+ * the range while in resourses, end points to the
+ * last byte in the range.
+ */
+ res->end = end - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
WARN_ON(request_resource(&iomem_resource, res) < 0);
}
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 1f61fa2148b5..f4e20d8e6c02 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -804,17 +804,14 @@ static void __init setup_nonnuma(void)
unsigned long total_ram = memblock_phys_mem_size();
unsigned long start_pfn, end_pfn;
unsigned int nid = 0;
- struct memblock_region *reg;
+ int i;
printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
top_of_ram, total_ram);
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(top_of_ram - total_ram) >> 20);
- for_each_memblock(memory, reg) {
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
-
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
fake_numa_create_new_node(end_pfn, &nid);
memblock_set_node(PFN_PHYS(start_pfn),
PFN_PHYS(end_pfn - start_pfn),
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 6eb4eab79385..079159e97bca 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -123,11 +123,11 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
void __init mapin_ram(void)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
+ u64 i;
- for_each_memblock(memory, reg) {
- phys_addr_t base = reg->base;
- phys_addr_t top = min(base + reg->size, total_lowmem);
+ for_each_mem_range(i, &base, &end) {
+ phys_addr_t top = min(end, total_lowmem);
if (base >= top)
continue;
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index f750e012dbe5..1dc89303b679 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -145,21 +145,21 @@ static phys_addr_t dtb_early_pa __initdata;
void __init setup_bootmem(void)
{
- struct memblock_region *reg;
phys_addr_t mem_size = 0;
phys_addr_t total_mem = 0;
- phys_addr_t mem_start, end = 0;
+ phys_addr_t mem_start, start, end = 0;
phys_addr_t vmlinux_end = __pa_symbol(&_end);
phys_addr_t vmlinux_start = __pa_symbol(&_start);
+ u64 i;
/* Find the memory region containing the kernel */
- for_each_memblock(memory, reg) {
- end = reg->base + reg->size;
+ for_each_mem_range(i, &start, &end) {
+ phys_addr_t size = end - start;
if (!total_mem)
- mem_start = reg->base;
- if (reg->base <= vmlinux_start && vmlinux_end <= end)
- BUG_ON(reg->size == 0);
- total_mem = total_mem + reg->size;
+ mem_start = start;
+ if (start <= vmlinux_start && vmlinux_end <= end)
+ BUG_ON(size == 0);
+ total_mem = total_mem + size;
}
/*
@@ -191,15 +191,6 @@ void __init setup_bootmem(void)
early_init_fdt_scan_reserved_mem();
memblock_allow_resize();
memblock_dump_all();
-
- for_each_memblock(memory, reg) {
- unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
- unsigned long end_pfn = memblock_region_memory_end_pfn(reg);
-
- memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn),
- &memblock.memory, 0);
- }
}
#ifdef CONFIG_MMU
@@ -464,7 +455,7 @@ static void __init setup_vm_final(void)
{
uintptr_t va, map_size;
phys_addr_t pa, start, end;
- struct memblock_region *reg;
+ u64 i;
/* Set mmu_enabled flag */
mmu_enabled = true;
@@ -475,14 +466,9 @@ static void __init setup_vm_final(void)
PGDIR_SIZE, PAGE_TABLE);
/* Map all memory banks */
- for_each_memblock(memory, reg) {
- start = reg->base;
- end = start + reg->size;
-
+ for_each_mem_range(i, &start, &end) {
if (start >= end)
break;
- if (memblock_is_nomap(reg))
- continue;
if (start <= __pa(PAGE_OFFSET) &&
__pa(PAGE_OFFSET) < end)
start = __pa(PAGE_OFFSET);
@@ -545,7 +531,7 @@ static void __init resource_init(void)
{
struct memblock_region *region;
- for_each_memblock(memory, region) {
+ for_each_mem_region(region) {
struct resource *res;
res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 87b4ab3d3c77..12ddd1f6bf70 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -85,16 +85,16 @@ static void __init populate(void *start, void *end)
void __init kasan_init(void)
{
- struct memblock_region *reg;
- unsigned long i;
+ phys_addr_t _start, _end;
+ u64 i;
kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
(void *)kasan_mem_to_shadow((void *)
VMALLOC_END));
- for_each_memblock(memory, reg) {
- void *start = (void *)__va(reg->base);
- void *end = (void *)__va(reg->base + reg->size);
+ for_each_mem_range(i, &_start, &_end) {
+ void *start = (void *)_start;
+ void *end = (void *)_end;
if (start >= end)
break;
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index c2c1b4e723ea..d44e522c569b 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -484,8 +484,9 @@ static struct resource __initdata *standard_resources[] = {
static void __init setup_resources(void)
{
struct resource *res, *std_res, *sub_res;
- struct memblock_region *reg;
+ phys_addr_t start, end;
int j;
+ u64 i;
code_resource.start = (unsigned long) _text;
code_resource.end = (unsigned long) _etext - 1;
@@ -494,7 +495,7 @@ static void __init setup_resources(void)
bss_resource.start = (unsigned long) __bss_start;
bss_resource.end = (unsigned long) __bss_stop - 1;
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
res = memblock_alloc(sizeof(*res), 8);
if (!res)
panic("%s: Failed to allocate %zu bytes align=0x%x\n",
@@ -502,8 +503,13 @@ static void __init setup_resources(void)
res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
res->name = "System RAM";
- res->start = reg->base;
- res->end = reg->base + reg->size - 1;
+ res->start = start;
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in resourses, end points to the last byte in
+ * the range.
+ */
+ res->end = end - 1;
request_resource(&iomem_resource, res);
for (j = 0; j < ARRAY_SIZE(standard_resources); j++) {
@@ -776,8 +782,8 @@ static void __init memblock_add_mem_detect_info(void)
unsigned long start, end;
int i;
- memblock_dbg("physmem info source: %s (%hhd)\n",
- get_mem_info_source(), mem_detect.info_source);
+ pr_debug("physmem info source: %s (%hhd)\n",
+ get_mem_info_source(), mem_detect.info_source);
/* keep memblock lists close to the kernel */
memblock_set_bottom_up(true);
for_each_mem_detect_block(i, &start, &end) {
@@ -819,14 +825,15 @@ static void __init reserve_kernel(void)
static void __init setup_memory(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
/*
* Init storage key for present memory
*/
- for_each_memblock(memory, reg) {
- storage_key_init_range(reg->base, reg->base + reg->size);
- }
+ for_each_mem_range(i, &start, &end)
+ storage_key_init_range(start, end);
+
psw_set_key(PAGE_DEFAULT_KEY);
/* Only cosmetics */
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index fc141893d028..567c69f3069e 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -183,9 +183,9 @@ static void mark_kernel_pgd(void)
void __init cmma_init_nodat(void)
{
- struct memblock_region *reg;
struct page *page;
unsigned long start, end, ix;
+ int i;
if (cmma_flag < 2)
return;
@@ -193,9 +193,7 @@ void __init cmma_init_nodat(void)
mark_kernel_pgd();
/* Set all kernel pages not used for page tables to stable/no-dat */
- for_each_memblock(memory, reg) {
- start = memblock_region_memory_base_pfn(reg);
- end = memblock_region_memory_end_pfn(reg);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
page = pfn_to_page(start);
for (ix = start; ix < end; ix++, page++) {
if (__test_and_clear_bit(PG_arch_1, &page->flags))
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index eddf71c22875..b239f2ba93b0 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -555,10 +555,11 @@ int vmem_add_mapping(unsigned long start, unsigned long size)
*/
void __init vmem_map_init(void)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
+ u64 i;
- for_each_memblock(memory, reg)
- vmem_add_range(reg->base, reg->size);
+ for_each_mem_range(i, &base, &end)
+ vmem_add_range(base, end - base);
__set_memory((unsigned long)_stext,
(unsigned long)(_etext - _stext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 4735176ab811..3348e0c4d769 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -226,15 +226,12 @@ void __init allocate_pgdat(unsigned int nid)
static void __init do_init_bootmem(void)
{
- struct memblock_region *reg;
+ unsigned long start_pfn, end_pfn;
+ int i;
/* Add active regions with valid PFNs. */
- for_each_memblock(memory, reg) {
- unsigned long start_pfn, end_pfn;
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL)
__add_active_range(0, start_pfn, end_pfn);
- }
/* All of system RAM sits in node 0 for the non-NUMA case */
allocate_pgdat(0);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index fad6d3129904..96edf64d4fb3 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1192,18 +1192,14 @@ int of_node_to_nid(struct device_node *dp)
static void __init add_node_ranges(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
unsigned long prev_max;
+ u64 i;
memblock_resized:
prev_max = memblock.memory.max;
- for_each_memblock(memory, reg) {
- unsigned long size = reg->size;
- unsigned long start, end;
-
- start = reg->base;
- end = start + size;
+ for_each_mem_range(i, &start, &end) {
while (start < end) {
unsigned long this_end;
int nid;
@@ -1211,7 +1207,7 @@ memblock_resized:
this_end = memblock_nid_range(start, end, &nid);
numadbg("Setting memblock NUMA node nid[%d] "
- "start[%lx] end[%lx]\n",
+ "start[%llx] end[%lx]\n",
nid, start, this_end);
memblock_set_node(start, this_end - start,
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index bbfde3d2662f..0aecc0b629e0 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -3,6 +3,7 @@
#define _ASM_X86_NUMA_H
#include <linux/nodemask.h>
+#include <linux/errno.h>
#include <asm/topology.h>
#include <asm/apicdef.h>
@@ -77,7 +78,12 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable);
#ifdef CONFIG_NUMA_EMU
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-void numa_emu_cmdline(char *);
+int numa_emu_cmdline(char *str);
+#else /* CONFIG_NUMA_EMU */
+static inline int numa_emu_cmdline(char *str)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_NUMA_EMU */
#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 983cd53ed4c9..22aad412f965 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -305,6 +305,20 @@ static int __init cpcompare(const void *a, const void *b)
return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
}
+static bool e820_nomerge(enum e820_type type)
+{
+ /*
+ * These types may indicate distinct platform ranges aligned to
+ * numa node, protection domain, performance domain, or other
+ * boundaries. Do not merge them.
+ */
+ if (type == E820_TYPE_PRAM)
+ return true;
+ if (type == E820_TYPE_SOFT_RESERVED)
+ return true;
+ return false;
+}
+
int __init e820__update_table(struct e820_table *table)
{
struct e820_entry *entries = table->entries;
@@ -380,7 +394,7 @@ int __init e820__update_table(struct e820_table *table)
}
/* Continue building up new map based on this information: */
- if (current_type != last_type || current_type == E820_TYPE_PRAM) {
+ if (current_type != last_type || e820_nomerge(current_type)) {
if (last_type != 0) {
new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
/* Move forward only if the new size was non-zero: */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fa16b906ea3f..210e878c4c0d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -264,16 +264,12 @@ static void __init relocate_initrd(void)
u64 area_size = PAGE_ALIGN(ramdisk_size);
/* We need to move the initrd down into directly mapped mem */
- relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- area_size, PAGE_SIZE);
-
+ relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
- /* Note: this includes all the mem currently occupied by
- the initrd, we rely on that fact to keep the data intact. */
- memblock_reserve(relocated_ramdisk, area_size);
initrd_start = relocated_ramdisk + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
@@ -300,13 +296,13 @@ static void __init early_reserve_initrd(void)
memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
}
+
static void __init reserve_initrd(void)
{
/* Assume only end is not page aligned */
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size = get_ramdisk_size();
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- u64 mapped_size;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -314,12 +310,6 @@ static void __init reserve_initrd(void)
initrd_start = 0;
- mapped_size = memblock_mem_size(max_pfn_mapped);
- if (ramdisk_size >= (mapped_size>>1))
- panic("initrd too large to handle, "
- "disabling initrd (%lld needed, %lld available)\n",
- ramdisk_size, mapped_size>>1);
-
printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
ramdisk_end - 1);
@@ -431,13 +421,13 @@ static int __init reserve_crashkernel_low(void)
{
#ifdef CONFIG_X86_64
unsigned long long base, low_base = 0, low_size = 0;
- unsigned long total_low_mem;
+ unsigned long low_mem_limit;
int ret;
- total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT));
+ low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
/* crashkernel=Y,low */
- ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base);
+ ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
if (ret) {
/*
* two parts from kernel/dma/swiotlb.c:
@@ -455,23 +445,17 @@ static int __init reserve_crashkernel_low(void)
return 0;
}
- low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
if (!low_base) {
pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
(unsigned long)(low_size >> 20));
return -ENOMEM;
}
- ret = memblock_reserve(low_base, low_size);
- if (ret) {
- pr_err("%s: Error reserving crashkernel low memblock.\n", __func__);
- return ret;
- }
-
- pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
+ pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
(unsigned long)(low_size >> 20),
(unsigned long)(low_base >> 20),
- (unsigned long)(total_low_mem >> 20));
+ (unsigned long)(low_mem_limit >> 20));
crashk_low_res.start = low_base;
crashk_low_res.end = low_base + low_size - 1;
@@ -515,13 +499,13 @@ static void __init reserve_crashkernel(void)
* unless "crashkernel=size[KMG],high" is specified.
*/
if (!high)
- crash_base = memblock_find_in_range(CRASH_ALIGN,
- CRASH_ADDR_LOW_MAX,
- crash_size, CRASH_ALIGN);
+ crash_base = memblock_phys_alloc_range(crash_size,
+ CRASH_ALIGN, CRASH_ALIGN,
+ CRASH_ADDR_LOW_MAX);
if (!crash_base)
- crash_base = memblock_find_in_range(CRASH_ALIGN,
- CRASH_ADDR_HIGH_MAX,
- crash_size, CRASH_ALIGN);
+ crash_base = memblock_phys_alloc_range(crash_size,
+ CRASH_ALIGN, CRASH_ALIGN,
+ CRASH_ADDR_HIGH_MAX);
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
@@ -529,19 +513,13 @@ static void __init reserve_crashkernel(void)
} else {
unsigned long long start;
- start = memblock_find_in_range(crash_base,
- crash_base + crash_size,
- crash_size, 1 << 20);
+ start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
+ crash_base + crash_size);
if (start != crash_base) {
pr_info("crashkernel reservation failed - memory is in use.\n");
return;
}
}
- ret = memblock_reserve(crash_base, crash_size);
- if (ret) {
- pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
- return;
- }
if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
memblock_free(crash_base, crash_size);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index aa76ec2d359b..9df94e0aaee1 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -37,14 +37,12 @@ static __init int numa_setup(char *opt)
return -EINVAL;
if (!strncmp(opt, "off", 3))
numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
- numa_emu_cmdline(opt + 5);
-#endif
-#ifdef CONFIG_ACPI_NUMA
+ return numa_emu_cmdline(opt + 5);
if (!strncmp(opt, "noacpi", 6))
- acpi_numa = -1;
-#endif
+ disable_srat();
+ if (!strncmp(opt, "nohmat", 6))
+ disable_hmat();
return 0;
}
early_param("numa", numa_setup);
@@ -516,7 +514,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
* memory ranges, because quirks such as trim_snb_memory()
* reserve specific pages for Sandy Bridge graphics. ]
*/
- for_each_memblock(reserved, mb_region) {
+ for_each_reserved_mem_region(mb_region) {
int nid = memblock_get_region_node(mb_region);
if (nid != MAX_NUMNODES)
@@ -919,7 +917,6 @@ int phys_to_target_node(phys_addr_t start)
return meminfo_to_nid(&numa_reserved_meminfo, start);
}
-EXPORT_SYMBOL_GPL(phys_to_target_node);
int memory_add_physaddr_to_nid(u64 start)
{
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 683cd12f4793..87d77cc52f86 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -13,9 +13,10 @@
static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
-void __init numa_emu_cmdline(char *str)
+int __init numa_emu_cmdline(char *str)
{
emu_cmdline = str;
+ return 0;
}
static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 41485a8a6dcf..b1418a6c0e90 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1300,7 +1300,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
* any NUMA information the kernel tries to get from ACPI will
* be meaningless. Prevent it from trying.
*/
- acpi_numa = -1;
+ disable_srat();
#endif
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index a05b306cf371..ad9d59d93f39 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -79,67 +79,32 @@ void __init zones_init(void)
free_area_init(max_zone_pfn);
}
-#ifdef CONFIG_HIGHMEM
-static void __init free_area_high(unsigned long pfn, unsigned long end)
-{
- for (; pfn < end; pfn++)
- free_highmem_page(pfn_to_page(pfn));
-}
-
static void __init free_highpages(void)
{
+#ifdef CONFIG_HIGHMEM
unsigned long max_low = max_low_pfn;
- struct memblock_region *mem, *res;
+ phys_addr_t range_start, range_end;
+ u64 i;
- reset_all_zones_managed_pages();
/* set highmem page free */
- for_each_memblock(memory, mem) {
- unsigned long start = memblock_region_memory_base_pfn(mem);
- unsigned long end = memblock_region_memory_end_pfn(mem);
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+ &range_start, &range_end, NULL) {
+ unsigned long start = PHYS_PFN(range_start);
+ unsigned long end = PHYS_PFN(range_end);
/* Ignore complete lowmem entries */
if (end <= max_low)
continue;
- if (memblock_is_nomap(mem))
- continue;
-
/* Truncate partial highmem entries */
if (start < max_low)
start = max_low;
- /* Find and exclude any reserved regions */
- for_each_memblock(reserved, res) {
- unsigned long res_start, res_end;
-
- res_start = memblock_region_reserved_base_pfn(res);
- res_end = memblock_region_reserved_end_pfn(res);
-
- if (res_end < start)
- continue;
- if (res_start < start)
- res_start = start;
- if (res_start > end)
- res_start = end;
- if (res_end > end)
- res_end = end;
- if (res_start != start)
- free_area_high(start, res_start);
- start = res_end;
- if (start == end)
- break;
- }
-
- /* And now free anything which remains */
- if (start < end)
- free_area_high(start, end);
+ for (; start < end; start++)
+ free_highmem_page(pfn_to_page(start));
}
-}
-#else
-static void __init free_highpages(void)
-{
-}
#endif
+}
/*
* Initialize memory pages.
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 2c32cfb72370..134bcb40b2af 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -24,8 +24,15 @@
#include <linux/mutex.h>
#include <linux/node.h>
#include <linux/sysfs.h>
+#include <linux/dax.h>
static u8 hmat_revision;
+static int hmat_disable __initdata;
+
+void __init disable_hmat(void)
+{
+ hmat_disable = 1;
+}
static LIST_HEAD(targets);
static LIST_HEAD(initiators);
@@ -634,66 +641,6 @@ static void hmat_register_target_perf(struct memory_target *target)
node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
}
-static void hmat_register_target_device(struct memory_target *target,
- struct resource *r)
-{
- /* define a clean / non-busy resource for the platform device */
- struct resource res = {
- .start = r->start,
- .end = r->end,
- .flags = IORESOURCE_MEM,
- };
- struct platform_device *pdev;
- struct memregion_info info;
- int rc, id;
-
- rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
- IORES_DESC_SOFT_RESERVED);
- if (rc != REGION_INTERSECTS)
- return;
-
- id = memregion_alloc(GFP_KERNEL);
- if (id < 0) {
- pr_err("memregion allocation failure for %pr\n", &res);
- return;
- }
-
- pdev = platform_device_alloc("hmem", id);
- if (!pdev) {
- pr_err("hmem device allocation failure for %pr\n", &res);
- goto out_pdev;
- }
-
- pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm);
- info = (struct memregion_info) {
- .target_node = acpi_map_pxm_to_node(target->memory_pxm),
- };
- rc = platform_device_add_data(pdev, &info, sizeof(info));
- if (rc < 0) {
- pr_err("hmem memregion_info allocation failure for %pr\n", &res);
- goto out_pdev;
- }
-
- rc = platform_device_add_resources(pdev, &res, 1);
- if (rc < 0) {
- pr_err("hmem resource allocation failure for %pr\n", &res);
- goto out_resource;
- }
-
- rc = platform_device_add(pdev);
- if (rc < 0) {
- dev_err(&pdev->dev, "device add failed for %pr\n", &res);
- goto out_resource;
- }
-
- return;
-
-out_resource:
- put_device(&pdev->dev);
-out_pdev:
- memregion_free(id);
-}
-
static void hmat_register_target_devices(struct memory_target *target)
{
struct resource *res;
@@ -705,8 +652,11 @@ static void hmat_register_target_devices(struct memory_target *target)
if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
return;
- for (res = target->memregions.child; res; res = res->sibling)
- hmat_register_target_device(target, res);
+ for (res = target->memregions.child; res; res = res->sibling) {
+ int target_nid = acpi_map_pxm_to_node(target->memory_pxm);
+
+ hmem_register_device(target_nid, res);
+ }
}
static void hmat_register_target(struct memory_target *target)
@@ -814,7 +764,7 @@ static __init int hmat_init(void)
enum acpi_hmat_type i;
acpi_status status;
- if (srat_disabled())
+ if (srat_disabled() || hmat_disable)
return 0;
status = acpi_get_table(ACPI_SIG_SRAT, 0, &tbl);
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 15bbaab8500b..1b0ae0a1959b 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -27,7 +27,12 @@ static int node_to_pxm_map[MAX_NUMNODES]
= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
unsigned char acpi_srat_revision __initdata;
-int acpi_numa __initdata;
+static int acpi_numa __initdata;
+
+void __init disable_srat(void)
+{
+ acpi_numa = -1;
+}
int pxm_to_node(int pxm)
{
@@ -163,7 +168,7 @@ static int __init slit_valid(struct acpi_table_slit *slit)
void __init bad_srat(void)
{
pr_err("SRAT: SRAT not used.\n");
- acpi_numa = -1;
+ disable_srat();
}
int __init srat_disabled(void)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index f90e9f77bf8c..0bad9e6066c3 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3324,7 +3324,7 @@ struct device *device_find_child_by_name(struct device *parent,
klist_iter_init(&parent->p->klist_children, &i);
while ((child = next_device(&i)))
- if (!strcmp(dev_name(child), name) && get_device(child))
+ if (sysfs_streq(dev_name(child), name) && get_device(child))
break;
klist_iter_exit(&i);
return child;
diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
index 5b2a11a88951..2519ceede64b 100644
--- a/drivers/bus/mvebu-mbus.c
+++ b/drivers/bus/mvebu-mbus.c
@@ -610,23 +610,23 @@ static unsigned int armada_xp_mbus_win_remap_offset(int win)
static void __init
mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end)
{
- struct memblock_region *r;
- uint64_t s = 0;
+ phys_addr_t reg_start, reg_end;
+ uint64_t i, s = 0;
- for_each_memblock(memory, r) {
+ for_each_mem_range(i, &reg_start, &reg_end) {
/*
* This part of the memory is above 4 GB, so we don't
* care for the MBus bridge hole.
*/
- if (r->base >= 0x100000000ULL)
+ if (reg_start >= 0x100000000ULL)
continue;
/*
* The MBus bridge hole is at the end of the RAM under
* the 4 GB limit.
*/
- if (r->base + r->size > s)
- s = r->base + r->size;
+ if (reg_end > s)
+ s = reg_end;
}
*start = s;
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 3b6c06f07326..567428e10b7b 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -35,6 +35,7 @@ config DEV_DAX_PMEM
config DEV_DAX_HMEM
tristate "HMEM DAX: direct access to 'specific purpose' memory"
depends on EFI_SOFT_RESERVE
+ select NUMA_KEEP_MEMINFO if (NUMA && X86)
default DEV_DAX
help
EFI 2.8 platforms, and others, may advertise 'specific purpose'
@@ -48,6 +49,11 @@ config DEV_DAX_HMEM
Say M if unsure.
+config DEV_DAX_HMEM_DEVICES
+ depends on NUMA_KEEP_MEMINFO # for phys_to_target_node()
+ depends on DEV_DAX_HMEM && DAX=y
+ def_bool y
+
config DEV_DAX_KMEM
tristate "KMEM DAX: volatile-use of persistent memory"
default DEV_DAX
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 80065b38b3c4..9d4ba672d305 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -2,11 +2,10 @@
obj-$(CONFIG_DAX) += dax.o
obj-$(CONFIG_DEV_DAX) += device_dax.o
obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
-obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o
dax-y := super.o
dax-y += bus.o
device_dax-y := device.o
-dax_hmem-y := hmem.o
obj-y += pmem/
+obj-y += hmem/
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index df238c8b6ef2..27513d311242 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -6,6 +6,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/dax.h>
+#include <linux/io.h>
#include "dax-private.h"
#include "bus.h"
@@ -130,10 +131,63 @@ ATTRIBUTE_GROUPS(dax_drv);
static int dax_bus_match(struct device *dev, struct device_driver *drv);
+static bool is_static(struct dax_region *dax_region)
+{
+ return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
+}
+
+static u64 dev_dax_size(struct dev_dax *dev_dax)
+{
+ u64 size = 0;
+ int i;
+
+ device_lock_assert(&dev_dax->dev);
+
+ for (i = 0; i < dev_dax->nr_range; i++)
+ size += range_len(&dev_dax->ranges[i].range);
+
+ return size;
+}
+
+static int dax_bus_probe(struct device *dev)
+{
+ struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+ struct dax_region *dax_region = dev_dax->region;
+ int rc;
+
+ if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0)
+ return -ENXIO;
+
+ rc = dax_drv->probe(dev_dax);
+
+ if (rc || is_static(dax_region))
+ return rc;
+
+ /*
+ * Track new seed creation only after successful probe of the
+ * previous seed.
+ */
+ if (dax_region->seed == dev)
+ dax_region->seed = NULL;
+
+ return 0;
+}
+
+static int dax_bus_remove(struct device *dev)
+{
+ struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+
+ return dax_drv->remove(dev_dax);
+}
+
static struct bus_type dax_bus_type = {
.name = "dax",
.uevent = dax_bus_uevent,
.match = dax_bus_match,
+ .probe = dax_bus_probe,
+ .remove = dax_bus_remove,
.drv_groups = dax_drv_groups,
};
@@ -176,18 +230,269 @@ static ssize_t region_size_show(struct device *dev,
static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
region_size_show, NULL);
-static ssize_t align_show(struct device *dev,
+static ssize_t region_align_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct dax_region *dax_region = dev_get_drvdata(dev);
return sprintf(buf, "%u\n", dax_region->align);
}
-static DEVICE_ATTR_RO(align);
+static struct device_attribute dev_attr_region_align =
+ __ATTR(align, 0400, region_align_show, NULL);
+
+#define for_each_dax_region_resource(dax_region, res) \
+ for (res = (dax_region)->res.child; res; res = res->sibling)
+
+static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
+{
+ resource_size_t size = resource_size(&dax_region->res);
+ struct resource *res;
+
+ device_lock_assert(dax_region->dev);
+
+ for_each_dax_region_resource(dax_region, res)
+ size -= resource_size(res);
+ return size;
+}
+
+static ssize_t available_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dax_region *dax_region = dev_get_drvdata(dev);
+ unsigned long long size;
+
+ device_lock(dev);
+ size = dax_region_avail_size(dax_region);
+ device_unlock(dev);
+
+ return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(available_size);
+
+static ssize_t seed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dax_region *dax_region = dev_get_drvdata(dev);
+ struct device *seed;
+ ssize_t rc;
+
+ if (is_static(dax_region))
+ return -EINVAL;
+
+ device_lock(dev);
+ seed = dax_region->seed;
+ rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(seed);
+
+static ssize_t create_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dax_region *dax_region = dev_get_drvdata(dev);
+ struct device *youngest;
+ ssize_t rc;
+
+ if (is_static(dax_region))
+ return -EINVAL;
+
+ device_lock(dev);
+ youngest = dax_region->youngest;
+ rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
+ device_unlock(dev);
+
+ return rc;
+}
+
+static ssize_t create_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dax_region *dax_region = dev_get_drvdata(dev);
+ unsigned long long avail;
+ ssize_t rc;
+ int val;
+
+ if (is_static(dax_region))
+ return -EINVAL;
+
+ rc = kstrtoint(buf, 0, &val);
+ if (rc)
+ return rc;
+ if (val != 1)
+ return -EINVAL;
+
+ device_lock(dev);
+ avail = dax_region_avail_size(dax_region);
+ if (avail == 0)
+ rc = -ENOSPC;
+ else {
+ struct dev_dax_data data = {
+ .dax_region = dax_region,
+ .size = 0,
+ .id = -1,
+ };
+ struct dev_dax *dev_dax = devm_create_dev_dax(&data);
+
+ if (IS_ERR(dev_dax))
+ rc = PTR_ERR(dev_dax);
+ else {
+ /*
+ * In support of crafting multiple new devices
+ * simultaneously multiple seeds can be created,
+ * but only the first one that has not been
+ * successfully bound is tracked as the region
+ * seed.
+ */
+ if (!dax_region->seed)
+ dax_region->seed = &dev_dax->dev;
+ dax_region->youngest = &dev_dax->dev;
+ rc = len;
+ }
+ }
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RW(create);
+
+void kill_dev_dax(struct dev_dax *dev_dax)
+{
+ struct dax_device *dax_dev = dev_dax->dax_dev;
+ struct inode *inode = dax_inode(dax_dev);
+
+ kill_dax(dax_dev);
+ unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+}
+EXPORT_SYMBOL_GPL(kill_dev_dax);
+
+static void free_dev_dax_ranges(struct dev_dax *dev_dax)
+{
+ struct dax_region *dax_region = dev_dax->region;
+ int i;
+
+ device_lock_assert(dax_region->dev);
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range *range = &dev_dax->ranges[i].range;
+
+ __release_region(&dax_region->res, range->start,
+ range_len(range));
+ }
+ dev_dax->nr_range = 0;
+}
+
+static void unregister_dev_dax(void *dev)
+{
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+
+ dev_dbg(dev, "%s\n", __func__);
+
+ kill_dev_dax(dev_dax);
+ free_dev_dax_ranges(dev_dax);
+ device_del(dev);
+ put_device(dev);
+}
+
+/* a return value >= 0 indicates this invocation invalidated the id */
+static int __free_dev_dax_id(struct dev_dax *dev_dax)
+{
+ struct dax_region *dax_region = dev_dax->region;
+ struct device *dev = &dev_dax->dev;
+ int rc = dev_dax->id;
+
+ device_lock_assert(dev);
+
+ if (is_static(dax_region) || dev_dax->id < 0)
+ return -1;
+ ida_free(&dax_region->ida, dev_dax->id);
+ dev_dax->id = -1;
+ return rc;
+}
+
+static int free_dev_dax_id(struct dev_dax *dev_dax)
+{
+ struct device *dev = &dev_dax->dev;
+ int rc;
+
+ device_lock(dev);
+ rc = __free_dev_dax_id(dev_dax);
+ device_unlock(dev);
+ return rc;
+}
+
+static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dax_region *dax_region = dev_get_drvdata(dev);
+ struct dev_dax *dev_dax;
+ struct device *victim;
+ bool do_del = false;
+ int rc;
+
+ if (is_static(dax_region))
+ return -EINVAL;
+
+ victim = device_find_child_by_name(dax_region->dev, buf);
+ if (!victim)
+ return -ENXIO;
+
+ device_lock(dev);
+ device_lock(victim);
+ dev_dax = to_dev_dax(victim);
+ if (victim->driver || dev_dax_size(dev_dax))
+ rc = -EBUSY;
+ else {
+ /*
+ * Invalidate the device so it does not become active
+ * again, but always preserve device-id-0 so that
+ * /sys/bus/dax/ is guaranteed to be populated while any
+ * dax_region is registered.
+ */
+ if (dev_dax->id > 0) {
+ do_del = __free_dev_dax_id(dev_dax) >= 0;
+ rc = len;
+ if (dax_region->seed == victim)
+ dax_region->seed = NULL;
+ if (dax_region->youngest == victim)
+ dax_region->youngest = NULL;
+ } else
+ rc = -EBUSY;
+ }
+ device_unlock(victim);
+
+ /* won the race to invalidate the device, clean it up */
+ if (do_del)
+ devm_release_action(dev, unregister_dev_dax, victim);
+ device_unlock(dev);
+ put_device(victim);
+
+ return rc;
+}
+static DEVICE_ATTR_WO(delete);
+
+static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct dax_region *dax_region = dev_get_drvdata(dev);
+
+ if (is_static(dax_region))
+ if (a == &dev_attr_available_size.attr
+ || a == &dev_attr_create.attr
+ || a == &dev_attr_seed.attr
+ || a == &dev_attr_delete.attr)
+ return 0;
+ return a->mode;
+}
static struct attribute *dax_region_attributes[] = {
+ &dev_attr_available_size.attr,
&dev_attr_region_size.attr,
- &dev_attr_align.attr,
+ &dev_attr_region_align.attr,
+ &dev_attr_create.attr,
+ &dev_attr_seed.attr,
+ &dev_attr_delete.attr,
&dev_attr_id.attr,
NULL,
};
@@ -195,6 +500,7 @@ static struct attribute *dax_region_attributes[] = {
static const struct attribute_group dax_region_attribute_group = {
.name = "dax_region",
.attrs = dax_region_attributes,
+ .is_visible = dax_region_visible,
};
static const struct attribute_group *dax_region_attribute_groups[] = {
@@ -226,8 +532,8 @@ static void dax_region_unregister(void *region)
}
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
- struct resource *res, int target_node, unsigned int align,
- unsigned long long pfn_flags)
+ struct range *range, int target_node, unsigned int align,
+ unsigned long flags)
{
struct dax_region *dax_region;
@@ -241,8 +547,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
return NULL;
}
- if (!IS_ALIGNED(res->start, align)
- || !IS_ALIGNED(resource_size(res), align))
+ if (!IS_ALIGNED(range->start, align)
+ || !IS_ALIGNED(range_len(range), align))
return NULL;
dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
@@ -250,13 +556,18 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
return NULL;
dev_set_drvdata(parent, dax_region);
- memcpy(&dax_region->res, res, sizeof(*res));
- dax_region->pfn_flags = pfn_flags;
kref_init(&dax_region->kref);
dax_region->id = region_id;
dax_region->align = align;
dax_region->dev = parent;
dax_region->target_node = target_node;
+ ida_init(&dax_region->ida);
+ dax_region->res = (struct resource) {
+ .start = range->start,
+ .end = range->end,
+ .flags = IORESOURCE_MEM | flags,
+ };
+
if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
kfree(dax_region);
return NULL;
@@ -269,45 +580,631 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
}
EXPORT_SYMBOL_GPL(alloc_dax_region);
+static void dax_mapping_release(struct device *dev)
+{
+ struct dax_mapping *mapping = to_dax_mapping(dev);
+ struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+
+ ida_free(&dev_dax->ida, mapping->id);
+ kfree(mapping);
+}
+
+static void unregister_dax_mapping(void *data)
+{
+ struct device *dev = data;
+ struct dax_mapping *mapping = to_dax_mapping(dev);
+ struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+ struct dax_region *dax_region = dev_dax->region;
+
+ dev_dbg(dev, "%s\n", __func__);
+
+ device_lock_assert(dax_region->dev);
+
+ dev_dax->ranges[mapping->range_id].mapping = NULL;
+ mapping->range_id = -1;
+
+ device_del(dev);
+ put_device(dev);
+}
+
+static struct dev_dax_range *get_dax_range(struct device *dev)
+{
+ struct dax_mapping *mapping = to_dax_mapping(dev);
+ struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+ struct dax_region *dax_region = dev_dax->region;
+
+ device_lock(dax_region->dev);
+ if (mapping->range_id < 0) {
+ device_unlock(dax_region->dev);
+ return NULL;
+ }
+
+ return &dev_dax->ranges[mapping->range_id];
+}
+
+static void put_dax_range(struct dev_dax_range *dax_range)
+{
+ struct dax_mapping *mapping = dax_range->mapping;
+ struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent);
+ struct dax_region *dax_region = dev_dax->region;
+
+ device_unlock(dax_region->dev);
+}
+
+static ssize_t start_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dev_dax_range *dax_range;
+ ssize_t rc;
+
+ dax_range = get_dax_range(dev);
+ if (!dax_range)
+ return -ENXIO;
+ rc = sprintf(buf, "%#llx\n", dax_range->range.start);
+ put_dax_range(dax_range);
+
+ return rc;
+}
+static DEVICE_ATTR(start, 0400, start_show, NULL);
+
+static ssize_t end_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dev_dax_range *dax_range;
+ ssize_t rc;
+
+ dax_range = get_dax_range(dev);
+ if (!dax_range)
+ return -ENXIO;
+ rc = sprintf(buf, "%#llx\n", dax_range->range.end);
+ put_dax_range(dax_range);
+
+ return rc;
+}
+static DEVICE_ATTR(end, 0400, end_show, NULL);
+
+static ssize_t pgoff_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dev_dax_range *dax_range;
+ ssize_t rc;
+
+ dax_range = get_dax_range(dev);
+ if (!dax_range)
+ return -ENXIO;
+ rc = sprintf(buf, "%#lx\n", dax_range->pgoff);
+ put_dax_range(dax_range);
+
+ return rc;
+}
+static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL);
+
+static struct attribute *dax_mapping_attributes[] = {
+ &dev_attr_start.attr,
+ &dev_attr_end.attr,
+ &dev_attr_page_offset.attr,
+ NULL,
+};
+
+static const struct attribute_group dax_mapping_attribute_group = {
+ .attrs = dax_mapping_attributes,
+};
+
+static const struct attribute_group *dax_mapping_attribute_groups[] = {
+ &dax_mapping_attribute_group,
+ NULL,
+};
+
+static struct device_type dax_mapping_type = {
+ .release = dax_mapping_release,
+ .groups = dax_mapping_attribute_groups,
+};
+
+static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id)
+{
+ struct dax_region *dax_region = dev_dax->region;
+ struct dax_mapping *mapping;
+ struct device *dev;
+ int rc;
+
+ device_lock_assert(dax_region->dev);
+
+ if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver,
+ "region disabled\n"))
+ return -ENXIO;
+
+ mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+ if (!mapping)
+ return -ENOMEM;
+ mapping->range_id = range_id;
+ mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL);
+ if (mapping->id < 0) {
+ kfree(mapping);
+ return -ENOMEM;
+ }
+ dev_dax->ranges[range_id].mapping = mapping;
+ dev = &mapping->dev;
+ device_initialize(dev);
+ dev->parent = &dev_dax->dev;
+ dev->type = &dax_mapping_type;
+ dev_set_name(dev, "mapping%d", mapping->id);
+ rc = device_add(dev);
+ if (rc) {
+ put_device(dev);
+ return rc;
+ }
+
+ rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_mapping,
+ dev);
+ if (rc)
+ return rc;
+ return 0;
+}
+
+static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
+ resource_size_t size)
+{
+ struct dax_region *dax_region = dev_dax->region;
+ struct resource *res = &dax_region->res;
+ struct device *dev = &dev_dax->dev;
+ struct dev_dax_range *ranges;
+ unsigned long pgoff = 0;
+ struct resource *alloc;
+ int i, rc;
+
+ device_lock_assert(dax_region->dev);
+
+ /* handle the seed alloc special case */
+ if (!size) {
+ if (dev_WARN_ONCE(dev, dev_dax->nr_range,
+ "0-size allocation must be first\n"))
+ return -EBUSY;
+ /* nr_range == 0 is elsewhere special cased as 0-size device */
+ return 0;
+ }
+
+ ranges = krealloc(dev_dax->ranges, sizeof(*ranges)
+ * (dev_dax->nr_range + 1), GFP_KERNEL);
+ if (!ranges)
+ return -ENOMEM;
+
+ alloc = __request_region(res, start, size, dev_name(dev), 0);
+ if (!alloc) {
+ /*
+ * If this was an empty set of ranges nothing else
+ * will release @ranges, so do it now.
+ */
+ if (!dev_dax->nr_range) {
+ kfree(ranges);
+ ranges = NULL;
+ }
+ dev_dax->ranges = ranges;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < dev_dax->nr_range; i++)
+ pgoff += PHYS_PFN(range_len(&ranges[i].range));
+ dev_dax->ranges = ranges;
+ ranges[dev_dax->nr_range++] = (struct dev_dax_range) {
+ .pgoff = pgoff,
+ .range = {
+ .start = alloc->start,
+ .end = alloc->end,
+ },
+ };
+
+ dev_dbg(dev, "alloc range[%d]: %pa:%pa\n", dev_dax->nr_range - 1,
+ &alloc->start, &alloc->end);
+ /*
+ * A dev_dax instance must be registered before mapping device
+ * children can be added. Defer to devm_create_dev_dax() to add
+ * the initial mapping device.
+ */
+ if (!device_is_registered(&dev_dax->dev))
+ return 0;
+
+ rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1);
+ if (rc) {
+ dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1,
+ &alloc->start, &alloc->end);
+ dev_dax->nr_range--;
+ __release_region(res, alloc->start, resource_size(alloc));
+ return rc;
+ }
+
+ return 0;
+}
+
+static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size)
+{
+ int last_range = dev_dax->nr_range - 1;
+ struct dev_dax_range *dax_range = &dev_dax->ranges[last_range];
+ struct dax_region *dax_region = dev_dax->region;
+ bool is_shrink = resource_size(res) > size;
+ struct range *range = &dax_range->range;
+ struct device *dev = &dev_dax->dev;
+ int rc;
+
+ device_lock_assert(dax_region->dev);
+
+ if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n"))
+ return -EINVAL;
+
+ rc = adjust_resource(res, range->start, size);
+ if (rc)
+ return rc;
+
+ *range = (struct range) {
+ .start = range->start,
+ .end = range->start + size - 1,
+ };
+
+ dev_dbg(dev, "%s range[%d]: %#llx:%#llx\n", is_shrink ? "shrink" : "extend",
+ last_range, (unsigned long long) range->start,
+ (unsigned long long) range->end);
+
+ return 0;
+}
+
static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct dev_dax *dev_dax = to_dev_dax(dev);
- unsigned long long size = resource_size(&dev_dax->region->res);
+ unsigned long long size;
+
+ device_lock(dev);
+ size = dev_dax_size(dev_dax);
+ device_unlock(dev);
return sprintf(buf, "%llu\n", size);
}
-static DEVICE_ATTR_RO(size);
-static int dev_dax_target_node(struct dev_dax *dev_dax)
+static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size)
+{
+ /*
+ * The minimum mapping granularity for a device instance is a
+ * single subsection, unless the arch says otherwise.
+ */
+ return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align, memremap_compat_align()));
+}
+
+static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
{
+ resource_size_t to_shrink = dev_dax_size(dev_dax) - size;
struct dax_region *dax_region = dev_dax->region;
+ struct device *dev = &dev_dax->dev;
+ int i;
+
+ for (i = dev_dax->nr_range - 1; i >= 0; i--) {
+ struct range *range = &dev_dax->ranges[i].range;
+ struct dax_mapping *mapping = dev_dax->ranges[i].mapping;
+ struct resource *adjust = NULL, *res;
+ resource_size_t shrink;
+
+ shrink = min_t(u64, to_shrink, range_len(range));
+ if (shrink >= range_len(range)) {
+ devm_release_action(dax_region->dev,
+ unregister_dax_mapping, &mapping->dev);
+ __release_region(&dax_region->res, range->start,
+ range_len(range));
+ dev_dax->nr_range--;
+ dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i,
+ (unsigned long long) range->start,
+ (unsigned long long) range->end);
+ to_shrink -= shrink;
+ if (!to_shrink)
+ break;
+ continue;
+ }
+
+ for_each_dax_region_resource(dax_region, res)
+ if (strcmp(res->name, dev_name(dev)) == 0
+ && res->start == range->start) {
+ adjust = res;
+ break;
+ }
+
+ if (dev_WARN_ONCE(dev, !adjust || i != dev_dax->nr_range - 1,
+ "failed to find matching resource\n"))
+ return -ENXIO;
+ return adjust_dev_dax_range(dev_dax, adjust, range_len(range)
+ - shrink);
+ }
+ return 0;
+}
- return dax_region->target_node;
+/*
+ * Only allow adjustments that preserve the relative pgoff of existing
+ * allocations. I.e. the dev_dax->ranges array is ordered by increasing pgoff.
+ */
+static bool adjust_ok(struct dev_dax *dev_dax, struct resource *res)
+{
+ struct dev_dax_range *last;
+ int i;
+
+ if (dev_dax->nr_range == 0)
+ return false;
+ if (strcmp(res->name, dev_name(&dev_dax->dev)) != 0)
+ return false;
+ last = &dev_dax->ranges[dev_dax->nr_range - 1];
+ if (last->range.start != res->start || last->range.end != res->end)
+ return false;
+ for (i = 0; i < dev_dax->nr_range - 1; i++) {
+ struct dev_dax_range *dax_range = &dev_dax->ranges[i];
+
+ if (dax_range->pgoff > last->pgoff)
+ return false;
+ }
+
+ return true;
}
-static ssize_t target_node_show(struct device *dev,
+static ssize_t dev_dax_resize(struct dax_region *dax_region,
+ struct dev_dax *dev_dax, resource_size_t size)
+{
+ resource_size_t avail = dax_region_avail_size(dax_region), to_alloc;
+ resource_size_t dev_size = dev_dax_size(dev_dax);
+ struct resource *region_res = &dax_region->res;
+ struct device *dev = &dev_dax->dev;
+ struct resource *res, *first;
+ resource_size_t alloc = 0;
+ int rc;
+
+ if (dev->driver)
+ return -EBUSY;
+ if (size == dev_size)
+ return 0;
+ if (size > dev_size && size - dev_size > avail)
+ return -ENOSPC;
+ if (size < dev_size)
+ return dev_dax_shrink(dev_dax, size);
+
+ to_alloc = size - dev_size;
+ if (dev_WARN_ONCE(dev, !alloc_is_aligned(dev_dax, to_alloc),
+ "resize of %pa misaligned\n", &to_alloc))
+ return -ENXIO;
+
+ /*
+ * Expand the device into the unused portion of the region. This
+ * may involve adjusting the end of an existing resource, or
+ * allocating a new resource.
+ */
+retry:
+ first = region_res->child;
+ if (!first)
+ return alloc_dev_dax_range(dev_dax, dax_region->res.start, to_alloc);
+
+ rc = -ENOSPC;
+ for (res = first; res; res = res->sibling) {
+ struct resource *next = res->sibling;
+
+ /* space at the beginning of the region */
+ if (res == first && res->start > dax_region->res.start) {
+ alloc = min(res->start - dax_region->res.start, to_alloc);
+ rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, alloc);
+ break;
+ }
+
+ alloc = 0;
+ /* space between allocations */
+ if (next && next->start > res->end + 1)
+ alloc = min(next->start - (res->end + 1), to_alloc);
+
+ /* space at the end of the region */
+ if (!alloc && !next && res->end < region_res->end)
+ alloc = min(region_res->end - res->end, to_alloc);
+
+ if (!alloc)
+ continue;
+
+ if (adjust_ok(dev_dax, res)) {
+ rc = adjust_dev_dax_range(dev_dax, res, resource_size(res) + alloc);
+ break;
+ }
+ rc = alloc_dev_dax_range(dev_dax, res->end + 1, alloc);
+ break;
+ }
+ if (rc)
+ return rc;
+ to_alloc -= alloc;
+ if (to_alloc)
+ goto retry;
+ return 0;
+}
+
+static ssize_t size_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ ssize_t rc;
+ unsigned long long val;
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+ struct dax_region *dax_region = dev_dax->region;
+
+ rc = kstrtoull(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ if (!alloc_is_aligned(dev_dax, val)) {
+ dev_dbg(dev, "%s: size: %lld misaligned\n", __func__, val);
+ return -EINVAL;
+ }
+
+ device_lock(dax_region->dev);
+ if (!dax_region->dev->driver) {
+ device_unlock(dax_region->dev);
+ return -ENXIO;
+ }
+ device_lock(dev);
+ rc = dev_dax_resize(dax_region, dev_dax, val);
+ device_unlock(dev);
+ device_unlock(dax_region->dev);
+
+ return rc == 0 ? len : rc;
+}
+static DEVICE_ATTR_RW(size);
+
+static ssize_t range_parse(const char *opt, size_t len, struct range *range)
+{
+ unsigned long long addr = 0;
+ char *start, *end, *str;
+ ssize_t rc = EINVAL;
+
+ str = kstrdup(opt, GFP_KERNEL);
+ if (!str)
+ return rc;
+
+ end = str;
+ start = strsep(&end, "-");
+ if (!start || !end)
+ goto err;
+
+ rc = kstrtoull(start, 16, &addr);
+ if (rc)
+ goto err;
+ range->start = addr;
+
+ rc = kstrtoull(end, 16, &addr);
+ if (rc)
+ goto err;
+ range->end = addr;
+
+err:
+ kfree(str);
+ return rc;
+}
+
+static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+ struct dax_region *dax_region = dev_dax->region;
+ size_t to_alloc;
+ struct range r;
+ ssize_t rc;
+
+ rc = range_parse(buf, len, &r);
+ if (rc)
+ return rc;
+
+ rc = -ENXIO;
+ device_lock(dax_region->dev);
+ if (!dax_region->dev->driver) {
+ device_unlock(dax_region->dev);
+ return rc;
+ }
+ device_lock(dev);
+
+ to_alloc = range_len(&r);
+ if (alloc_is_aligned(dev_dax, to_alloc))
+ rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc);
+ device_unlock(dev);
+ device_unlock(dax_region->dev);
+
+ return rc == 0 ? len : rc;
+}
+static DEVICE_ATTR_WO(mapping);
+
+static ssize_t align_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct dev_dax *dev_dax = to_dev_dax(dev);
- return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax));
+ return sprintf(buf, "%d\n", dev_dax->align);
}
-static DEVICE_ATTR_RO(target_node);
-static unsigned long long dev_dax_resource(struct dev_dax *dev_dax)
+static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax)
{
+ resource_size_t dev_size = dev_dax_size(dev_dax);
+ struct device *dev = &dev_dax->dev;
+ int i;
+
+ if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) {
+ dev_dbg(dev, "%s: align %u invalid for size %pa\n",
+ __func__, dev_dax->align, &dev_size);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ size_t len = range_len(&dev_dax->ranges[i].range);
+
+ if (!alloc_is_aligned(dev_dax, len)) {
+ dev_dbg(dev, "%s: align %u invalid for range %d\n",
+ __func__, dev_dax->align, i);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static ssize_t align_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dev_dax *dev_dax = to_dev_dax(dev);
struct dax_region *dax_region = dev_dax->region;
+ unsigned long val, align_save;
+ ssize_t rc;
- return dax_region->res.start;
+ rc = kstrtoul(buf, 0, &val);
+ if (rc)
+ return -ENXIO;
+
+ if (!dax_align_valid(val))
+ return -EINVAL;
+
+ device_lock(dax_region->dev);
+ if (!dax_region->dev->driver) {
+ device_unlock(dax_region->dev);
+ return -ENXIO;
+ }
+
+ device_lock(dev);
+ if (dev->driver) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
+ align_save = dev_dax->align;
+ dev_dax->align = val;
+ rc = dev_dax_validate_align(dev_dax);
+ if (rc)
+ dev_dax->align = align_save;
+out_unlock:
+ device_unlock(dev);
+ device_unlock(dax_region->dev);
+ return rc == 0 ? len : rc;
}
+static DEVICE_ATTR_RW(align);
+
+static int dev_dax_target_node(struct dev_dax *dev_dax)
+{
+ struct dax_region *dax_region = dev_dax->region;
+
+ return dax_region->target_node;
+}
+
+static ssize_t target_node_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+
+ return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax));
+}
+static DEVICE_ATTR_RO(target_node);
static ssize_t resource_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct dev_dax *dev_dax = to_dev_dax(dev);
+ struct dax_region *dax_region = dev_dax->region;
+ unsigned long long start;
+
+ if (dev_dax->nr_range < 1)
+ start = dax_region->res.start;
+ else
+ start = dev_dax->ranges[0].range.start;
- return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax));
+ return sprintf(buf, "%#llx\n", start);
}
static DEVICE_ATTR(resource, 0400, resource_show, NULL);
@@ -333,18 +1230,26 @@ static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
struct dev_dax *dev_dax = to_dev_dax(dev);
+ struct dax_region *dax_region = dev_dax->region;
if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0)
return 0;
if (a == &dev_attr_numa_node.attr && !IS_ENABLED(CONFIG_NUMA))
return 0;
+ if (a == &dev_attr_mapping.attr && is_static(dax_region))
+ return 0;
+ if ((a == &dev_attr_align.attr ||
+ a == &dev_attr_size.attr) && is_static(dax_region))
+ return 0444;
return a->mode;
}
static struct attribute *dev_dax_attributes[] = {
&dev_attr_modalias.attr,
&dev_attr_size.attr,
+ &dev_attr_mapping.attr,
&dev_attr_target_node.attr,
+ &dev_attr_align.attr,
&dev_attr_resource.attr,
&dev_attr_numa_node.attr,
NULL,
@@ -360,24 +1265,17 @@ static const struct attribute_group *dax_attribute_groups[] = {
NULL,
};
-void kill_dev_dax(struct dev_dax *dev_dax)
-{
- struct dax_device *dax_dev = dev_dax->dax_dev;
- struct inode *inode = dax_inode(dax_dev);
-
- kill_dax(dax_dev);
- unmap_mapping_range(inode->i_mapping, 0, 0, 1);
-}
-EXPORT_SYMBOL_GPL(kill_dev_dax);
-
static void dev_dax_release(struct device *dev)
{
struct dev_dax *dev_dax = to_dev_dax(dev);
struct dax_region *dax_region = dev_dax->region;
struct dax_device *dax_dev = dev_dax->dax_dev;
- dax_region_put(dax_region);
put_dax(dax_dev);
+ free_dev_dax_id(dev_dax);
+ dax_region_put(dax_region);
+ kfree(dev_dax->ranges);
+ kfree(dev_dax->pgmap);
kfree(dev_dax);
}
@@ -386,35 +1284,61 @@ static const struct device_type dev_dax_type = {
.groups = dax_attribute_groups,
};
-static void unregister_dev_dax(void *dev)
-{
- struct dev_dax *dev_dax = to_dev_dax(dev);
-
- dev_dbg(dev, "%s\n", __func__);
-
- kill_dev_dax(dev_dax);
- device_del(dev);
- put_device(dev);
-}
-
-struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
- struct dev_pagemap *pgmap, enum dev_dax_subsys subsys)
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
{
+ struct dax_region *dax_region = data->dax_region;
struct device *parent = dax_region->dev;
struct dax_device *dax_dev;
struct dev_dax *dev_dax;
struct inode *inode;
struct device *dev;
- int rc = -ENOMEM;
-
- if (id < 0)
- return ERR_PTR(-EINVAL);
+ int rc;
dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL);
if (!dev_dax)
return ERR_PTR(-ENOMEM);
- memcpy(&dev_dax->pgmap, pgmap, sizeof(*pgmap));
+ if (is_static(dax_region)) {
+ if (dev_WARN_ONCE(parent, data->id < 0,
+ "dynamic id specified to static region\n")) {
+ rc = -EINVAL;
+ goto err_id;
+ }
+
+ dev_dax->id = data->id;
+ } else {
+ if (dev_WARN_ONCE(parent, data->id >= 0,
+ "static id specified to dynamic region\n")) {
+ rc = -EINVAL;
+ goto err_id;
+ }
+
+ rc = ida_alloc(&dax_region->ida, GFP_KERNEL);
+ if (rc < 0)
+ goto err_id;
+ dev_dax->id = rc;
+ }
+
+ dev_dax->region = dax_region;
+ dev = &dev_dax->dev;
+ device_initialize(dev);
+ dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
+
+ rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, data->size);
+ if (rc)
+ goto err_range;
+
+ if (data->pgmap) {
+ dev_WARN_ONCE(parent, !is_static(dax_region),
+ "custom dev_pagemap requires a static dax_region\n");
+
+ dev_dax->pgmap = kmemdup(data->pgmap,
+ sizeof(struct dev_pagemap), GFP_KERNEL);
+ if (!dev_dax->pgmap) {
+ rc = -ENOMEM;
+ goto err_pgmap;
+ }
+ }
/*
* No 'host' or dax_operations since there is no access to this
@@ -423,30 +1347,26 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
if (IS_ERR(dax_dev)) {
rc = PTR_ERR(dax_dev);
- goto err;
+ goto err_alloc_dax;
}
/* a device_dax instance is dead while the driver is not attached */
kill_dax(dax_dev);
- /* from here on we're committed to teardown via dax_dev_release() */
- dev = &dev_dax->dev;
- device_initialize(dev);
-
dev_dax->dax_dev = dax_dev;
- dev_dax->region = dax_region;
dev_dax->target_node = dax_region->target_node;
+ dev_dax->align = dax_region->align;
+ ida_init(&dev_dax->ida);
kref_get(&dax_region->kref);
inode = dax_inode(dax_dev);
dev->devt = inode->i_rdev;
- if (subsys == DEV_DAX_BUS)
+ if (data->subsys == DEV_DAX_BUS)
dev->bus = &dax_bus_type;
else
dev->class = dax_class;
dev->parent = parent;
dev->type = &dev_dax_type;
- dev_set_name(dev, "dax%d.%d", dax_region->id, id);
rc = device_add(dev);
if (rc) {
@@ -459,14 +1379,27 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
if (rc)
return ERR_PTR(rc);
+ /* register mapping device for the initial allocation range */
+ if (dev_dax->nr_range && range_len(&dev_dax->ranges[0].range)) {
+ rc = devm_register_dax_mapping(dev_dax, 0);
+ if (rc)
+ return ERR_PTR(rc);
+ }
+
return dev_dax;
- err:
+err_alloc_dax:
+ kfree(dev_dax->pgmap);
+err_pgmap:
+ free_dev_dax_ranges(dev_dax);
+err_range:
+ free_dev_dax_id(dev_dax);
+err_id:
kfree(dev_dax);
return ERR_PTR(rc);
}
-EXPORT_SYMBOL_GPL(__devm_create_dev_dax);
+EXPORT_SYMBOL_GPL(devm_create_dev_dax);
static int match_always_count;
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 9e4eba67e8b9..72b92f95509f 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -3,29 +3,33 @@
#ifndef __DAX_BUS_H__
#define __DAX_BUS_H__
#include <linux/device.h>
+#include <linux/range.h>
struct dev_dax;
struct resource;
struct dax_device;
struct dax_region;
void dax_region_put(struct dax_region *dax_region);
+
+#define IORESOURCE_DAX_STATIC (1UL << 0)
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
- struct resource *res, int target_node, unsigned int align,
- unsigned long long flags);
+ struct range *range, int target_node, unsigned int align,
+ unsigned long flags);
enum dev_dax_subsys {
- DEV_DAX_BUS,
+ DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */
DEV_DAX_CLASS,
};
-struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
- struct dev_pagemap *pgmap, enum dev_dax_subsys subsys);
+struct dev_dax_data {
+ struct dax_region *dax_region;
+ struct dev_pagemap *pgmap;
+ enum dev_dax_subsys subsys;
+ resource_size_t size;
+ int id;
+};
-static inline struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
- int id, struct dev_pagemap *pgmap)
-{
- return __devm_create_dev_dax(dax_region, id, pgmap, DEV_DAX_BUS);
-}
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
/* to be deleted when DEV_DAX_CLASS is removed */
struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys);
@@ -34,6 +38,8 @@ struct dax_device_driver {
struct device_driver drv;
struct list_head ids;
int match_always;
+ int (*probe)(struct dev_dax *dev);
+ int (*remove)(struct dev_dax *dev);
};
int __dax_driver_register(struct dax_device_driver *dax_drv,
@@ -44,7 +50,7 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv);
void kill_dev_dax(struct dev_dax *dev_dax);
#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)
-int dev_dax_probe(struct device *dev);
+int dev_dax_probe(struct dev_dax *dev_dax);
#endif
/*
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 16850d5388ab..1c974b7caae6 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -7,6 +7,7 @@
#include <linux/device.h>
#include <linux/cdev.h>
+#include <linux/idr.h>
/* private routines between core files */
struct dax_device;
@@ -22,8 +23,10 @@ void dax_bus_exit(void);
* @kref: to pin while other agents have a need to do lookups
* @dev: parent device backing this region
* @align: allocation and mapping alignment for child dax devices
- * @res: physical address range of the region
- * @pfn_flags: identify whether the pfns are paged back or not
+ * @ida: instance id allocator
+ * @res: resource tree to track instance allocations
+ * @seed: allow userspace to find the first unbound seed device
+ * @youngest: allow userspace to find the most recently created device
*/
struct dax_region {
int id;
@@ -31,8 +34,16 @@ struct dax_region {
struct kref kref;
struct device *dev;
unsigned int align;
+ struct ida ida;
struct resource res;
- unsigned long long pfn_flags;
+ struct device *seed;
+ struct device *youngest;
+};
+
+struct dax_mapping {
+ struct device dev;
+ int range_id;
+ int id;
};
/**
@@ -41,22 +52,57 @@ struct dax_region {
* @region - parent region
* @dax_dev - core dax functionality
* @target_node: effective numa node if dev_dax memory range is onlined
+ * @id: ida allocated id
+ * @ida: mapping id allocator
* @dev - device core
* @pgmap - pgmap for memmap setup / lifetime (driver owned)
- * @dax_mem_res: physical address range of hotadded DAX memory
- * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed()
+ * @nr_range: size of @ranges
+ * @ranges: resource-span + pgoff tuples for the instance
*/
struct dev_dax {
struct dax_region *region;
struct dax_device *dax_dev;
+ unsigned int align;
int target_node;
+ int id;
+ struct ida ida;
struct device dev;
- struct dev_pagemap pgmap;
- struct resource *dax_kmem_res;
+ struct dev_pagemap *pgmap;
+ int nr_range;
+ struct dev_dax_range {
+ unsigned long pgoff;
+ struct range range;
+ struct dax_mapping *mapping;
+ } *ranges;
};
static inline struct dev_dax *to_dev_dax(struct device *dev)
{
return container_of(dev, struct dev_dax, dev);
}
+
+static inline struct dax_mapping *to_dax_mapping(struct device *dev)
+{
+ return container_of(dev, struct dax_mapping, dev);
+}
+
+phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline bool dax_align_valid(unsigned long align)
+{
+ if (align == PUD_SIZE && IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+ return true;
+ if (align == PMD_SIZE && has_transparent_hugepage())
+ return true;
+ if (align == PAGE_SIZE)
+ return true;
+ return false;
+}
+#else
+static inline bool dax_align_valid(unsigned long align)
+{
+ return align == PAGE_SIZE;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 1e89513f3c59..25e0b84a4296 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -17,7 +17,6 @@
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
const char *func)
{
- struct dax_region *dax_region = dev_dax->region;
struct device *dev = &dev_dax->dev;
unsigned long mask;
@@ -32,7 +31,7 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
return -EINVAL;
}
- mask = dax_region->align - 1;
+ mask = dev_dax->align - 1;
if (vma->vm_start & mask || vma->vm_end & mask) {
dev_info_ratelimited(dev,
"%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
@@ -41,14 +40,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
return -EINVAL;
}
- if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
- && (vma->vm_flags & VM_DONTCOPY) == 0) {
- dev_info_ratelimited(dev,
- "%s: %s: fail, dax range requires MADV_DONTFORK\n",
- current->comm, func);
- return -EINVAL;
- }
-
if (!vma_is_dax(vma)) {
dev_info_ratelimited(dev,
"%s: %s: fail, vma is not DAX capable\n",
@@ -63,15 +54,22 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
unsigned long size)
{
- struct resource *res = &dev_dax->region->res;
- phys_addr_t phys;
-
- phys = pgoff * PAGE_SIZE + res->start;
- if (phys >= res->start && phys <= res->end) {
- if (phys + size - 1 <= res->end)
+ int i;
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct dev_dax_range *dax_range = &dev_dax->ranges[i];
+ struct range *range = &dax_range->range;
+ unsigned long long pgoff_end;
+ phys_addr_t phys;
+
+ pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1;
+ if (pgoff < dax_range->pgoff || pgoff > pgoff_end)
+ continue;
+ phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start;
+ if (phys + size - 1 <= range->end)
return phys;
+ break;
}
-
return -1;
}
@@ -79,21 +77,19 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn)
{
struct device *dev = &dev_dax->dev;
- struct dax_region *dax_region;
phys_addr_t phys;
unsigned int fault_size = PAGE_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__))
return VM_FAULT_SIGBUS;
- dax_region = dev_dax->region;
- if (dax_region->align > PAGE_SIZE) {
+ if (dev_dax->align > PAGE_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
- dax_region->align, fault_size);
+ dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
- if (fault_size != dax_region->align)
+ if (fault_size != dev_dax->align)
return VM_FAULT_SIGBUS;
phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
@@ -102,7 +98,7 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
}
@@ -112,7 +108,6 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
{
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct device *dev = &dev_dax->dev;
- struct dax_region *dax_region;
phys_addr_t phys;
pgoff_t pgoff;
unsigned int fault_size = PMD_SIZE;
@@ -120,22 +115,15 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
if (check_vma(dev_dax, vmf->vma, __func__))
return VM_FAULT_SIGBUS;
- dax_region = dev_dax->region;
- if (dax_region->align > PMD_SIZE) {
+ if (dev_dax->align > PMD_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
- dax_region->align, fault_size);
- return VM_FAULT_SIGBUS;
- }
-
- /* dax pmd mappings require pfn_t_devmap() */
- if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
- dev_dbg(dev, "region lacks devmap flags\n");
+ dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
- if (fault_size < dax_region->align)
+ if (fault_size < dev_dax->align)
return VM_FAULT_SIGBUS;
- else if (fault_size > dax_region->align)
+ else if (fault_size > dev_dax->align)
return VM_FAULT_FALLBACK;
/* if we are outside of the VMA */
@@ -150,7 +138,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
}
@@ -161,7 +149,6 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
{
unsigned long pud_addr = vmf->address & PUD_MASK;
struct device *dev = &dev_dax->dev;
- struct dax_region *dax_region;
phys_addr_t phys;
pgoff_t pgoff;
unsigned int fault_size = PUD_SIZE;
@@ -170,22 +157,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
if (check_vma(dev_dax, vmf->vma, __func__))
return VM_FAULT_SIGBUS;
- dax_region = dev_dax->region;
- if (dax_region->align > PUD_SIZE) {
+ if (dev_dax->align > PUD_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
- dax_region->align, fault_size);
- return VM_FAULT_SIGBUS;
- }
-
- /* dax pud mappings require pfn_t_devmap() */
- if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
- dev_dbg(dev, "region lacks devmap flags\n");
+ dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
- if (fault_size < dax_region->align)
+ if (fault_size < dev_dax->align)
return VM_FAULT_SIGBUS;
- else if (fault_size > dax_region->align)
+ else if (fault_size > dev_dax->align)
return VM_FAULT_FALLBACK;
/* if we are outside of the VMA */
@@ -200,7 +180,7 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
}
@@ -280,9 +260,8 @@ static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
{
struct file *filp = vma->vm_file;
struct dev_dax *dev_dax = filp->private_data;
- struct dax_region *dax_region = dev_dax->region;
- if (!IS_ALIGNED(addr, dax_region->align))
+ if (!IS_ALIGNED(addr, dev_dax->align))
return -EINVAL;
return 0;
}
@@ -291,9 +270,8 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma)
{
struct file *filp = vma->vm_file;
struct dev_dax *dev_dax = filp->private_data;
- struct dax_region *dax_region = dev_dax->region;
- return dax_region->align;
+ return dev_dax->align;
}
static const struct vm_operations_struct dax_vm_ops = {
@@ -332,13 +310,11 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
{
unsigned long off, off_end, off_align, len_align, addr_align, align;
struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
- struct dax_region *dax_region;
if (!dev_dax || addr)
goto out;
- dax_region = dev_dax->region;
- align = dax_region->align;
+ align = dev_dax->align;
off = pgoff << PAGE_SHIFT;
off_end = off + len;
off_align = round_up(off, align);
@@ -412,25 +388,45 @@ static void dev_dax_kill(void *dev_dax)
kill_dev_dax(dev_dax);
}
-int dev_dax_probe(struct device *dev)
+int dev_dax_probe(struct dev_dax *dev_dax)
{
- struct dev_dax *dev_dax = to_dev_dax(dev);
struct dax_device *dax_dev = dev_dax->dax_dev;
- struct resource *res = &dev_dax->region->res;
+ struct device *dev = &dev_dax->dev;
+ struct dev_pagemap *pgmap;
struct inode *inode;
struct cdev *cdev;
void *addr;
- int rc;
+ int rc, i;
- /* 1:1 map region resource range to device-dax instance range */
- if (!devm_request_mem_region(dev, res->start, resource_size(res),
- dev_name(dev))) {
- dev_warn(dev, "could not reserve region %pR\n", res);
- return -EBUSY;
+ pgmap = dev_dax->pgmap;
+ if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1,
+ "static pgmap / multi-range device conflict\n"))
+ return -EINVAL;
+
+ if (!pgmap) {
+ pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range)
+ * (dev_dax->nr_range - 1), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+ pgmap->nr_range = dev_dax->nr_range;
+ }
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range *range = &dev_dax->ranges[i].range;
+
+ if (!devm_request_mem_region(dev, range->start,
+ range_len(range), dev_name(dev))) {
+ dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
+ i, range->start, range->end);
+ return -EBUSY;
+ }
+ /* don't update the range for static pgmap */
+ if (!dev_dax->pgmap)
+ pgmap->ranges[i] = *range;
}
- dev_dax->pgmap.type = MEMORY_DEVICE_GENERIC;
- addr = devm_memremap_pages(dev, &dev_dax->pgmap);
+ pgmap->type = MEMORY_DEVICE_GENERIC;
+ addr = devm_memremap_pages(dev, pgmap);
if (IS_ERR(addr))
return PTR_ERR(addr);
@@ -456,17 +452,15 @@ int dev_dax_probe(struct device *dev)
}
EXPORT_SYMBOL_GPL(dev_dax_probe);
-static int dev_dax_remove(struct device *dev)
+static int dev_dax_remove(struct dev_dax *dev_dax)
{
/* all probe actions are unwound by devm */
return 0;
}
static struct dax_device_driver device_dax_driver = {
- .drv = {
- .probe = dev_dax_probe,
- .remove = dev_dax_remove,
- },
+ .probe = dev_dax_probe,
+ .remove = dev_dax_remove,
.match_always = 1,
};
diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile
new file mode 100644
index 000000000000..57377b4c3d47
--- /dev/null
+++ b/drivers/dax/hmem/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o
+obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device_hmem.o
+
+device_hmem-y := device.o
+dax_hmem-y := hmem.o
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
new file mode 100644
index 000000000000..cb6401c9e9a4
--- /dev/null
+++ b/drivers/dax/hmem/device.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/platform_device.h>
+#include <linux/memregion.h>
+#include <linux/module.h>
+#include <linux/dax.h>
+#include <linux/mm.h>
+
+static bool nohmem;
+module_param_named(disable, nohmem, bool, 0444);
+
+void hmem_register_device(int target_nid, struct resource *r)
+{
+ /* define a clean / non-busy resource for the platform device */
+ struct resource res = {
+ .start = r->start,
+ .end = r->end,
+ .flags = IORESOURCE_MEM,
+ };
+ struct platform_device *pdev;
+ struct memregion_info info;
+ int rc, id;
+
+ if (nohmem)
+ return;
+
+ rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
+ IORES_DESC_SOFT_RESERVED);
+ if (rc != REGION_INTERSECTS)
+ return;
+
+ id = memregion_alloc(GFP_KERNEL);
+ if (id < 0) {
+ pr_err("memregion allocation failure for %pr\n", &res);
+ return;
+ }
+
+ pdev = platform_device_alloc("hmem", id);
+ if (!pdev) {
+ pr_err("hmem device allocation failure for %pr\n", &res);
+ goto out_pdev;
+ }
+
+ pdev->dev.numa_node = numa_map_to_online_node(target_nid);
+ info = (struct memregion_info) {
+ .target_node = target_nid,
+ };
+ rc = platform_device_add_data(pdev, &info, sizeof(info));
+ if (rc < 0) {
+ pr_err("hmem memregion_info allocation failure for %pr\n", &res);
+ goto out_pdev;
+ }
+
+ rc = platform_device_add_resources(pdev, &res, 1);
+ if (rc < 0) {
+ pr_err("hmem resource allocation failure for %pr\n", &res);
+ goto out_resource;
+ }
+
+ rc = platform_device_add(pdev);
+ if (rc < 0) {
+ dev_err(&pdev->dev, "device add failed for %pr\n", &res);
+ goto out_resource;
+ }
+
+ return;
+
+out_resource:
+ put_device(&pdev->dev);
+out_pdev:
+ memregion_free(id);
+}
+
+static __init int hmem_register_one(struct resource *res, void *data)
+{
+ /*
+ * If the resource is not a top-level resource it was already
+ * assigned to a device by the HMAT parsing.
+ */
+ if (res->parent != &iomem_resource) {
+ pr_info("HMEM: skip %pr, already claimed\n", res);
+ return 0;
+ }
+
+ hmem_register_device(phys_to_target_node(res->start), res);
+
+ return 0;
+}
+
+static __init int hmem_init(void)
+{
+ walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED,
+ IORESOURCE_MEM, 0, -1, NULL, hmem_register_one);
+ return 0;
+}
+
+/*
+ * As this is a fallback for address ranges unclaimed by the ACPI HMAT
+ * parsing it must be at an initcall level greater than hmat_init().
+ */
+late_initcall(hmem_init);
diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem/hmem.c
index fe7214daf62e..1bf040dbc834 100644
--- a/drivers/dax/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -3,30 +3,39 @@
#include <linux/memregion.h>
#include <linux/module.h>
#include <linux/pfn_t.h>
-#include "bus.h"
+#include "../bus.h"
+
+static bool region_idle;
+module_param_named(region_idle, region_idle, bool, 0644);
static int dax_hmem_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
- struct dev_pagemap pgmap = { };
struct dax_region *dax_region;
struct memregion_info *mri;
+ struct dev_dax_data data;
struct dev_dax *dev_dax;
struct resource *res;
+ struct range range;
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!res)
return -ENOMEM;
mri = dev->platform_data;
- memcpy(&pgmap.res, res, sizeof(*res));
-
- dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node,
- PMD_SIZE, PFN_DEV|PFN_MAP);
+ range.start = res->start;
+ range.end = res->end;
+ dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node,
+ PMD_SIZE, 0);
if (!dax_region)
return -ENOMEM;
- dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap);
+ data = (struct dev_dax_data) {
+ .dax_region = dax_region,
+ .id = -1,
+ .size = region_idle ? 0 : resource_size(res),
+ };
+ dev_dax = devm_create_dev_dax(&data);
if (IS_ERR(dev_dax))
return PTR_ERR(dev_dax);
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 275aa5f87399..6c933f2b604e 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -19,17 +19,28 @@ static const char *kmem_name;
/* Set if any memory will remain added when the driver will be unloaded. */
static bool any_hotremove_failed;
-int dev_dax_kmem_probe(struct device *dev)
+static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
{
- struct dev_dax *dev_dax = to_dev_dax(dev);
- struct resource *res = &dev_dax->region->res;
- resource_size_t kmem_start;
- resource_size_t kmem_size;
- resource_size_t kmem_end;
- struct resource *new_res;
- const char *new_res_name;
+ struct dev_dax_range *dax_range = &dev_dax->ranges[i];
+ struct range *range = &dax_range->range;
+
+ /* memory-block align the hotplug range */
+ r->start = ALIGN(range->start, memory_block_size_bytes());
+ r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1;
+ if (r->start >= r->end) {
+ r->start = range->start;
+ r->end = range->end;
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
+{
+ struct device *dev = &dev_dax->dev;
+ int i, mapped = 0;
+ char *res_name;
int numa_node;
- int rc;
/*
* Ensure good NUMA information for the persistent memory.
@@ -39,68 +50,80 @@ int dev_dax_kmem_probe(struct device *dev)
*/
numa_node = dev_dax->target_node;
if (numa_node < 0) {
- dev_warn(dev, "rejecting DAX region %pR with invalid node: %d\n",
- res, numa_node);
+ dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
+ numa_node);
return -EINVAL;
}
- /* Hotplug starting at the beginning of the next block: */
- kmem_start = ALIGN(res->start, memory_block_size_bytes());
-
- kmem_size = resource_size(res);
- /* Adjust the size down to compensate for moving up kmem_start: */
- kmem_size -= kmem_start - res->start;
- /* Align the size down to cover only complete blocks: */
- kmem_size &= ~(memory_block_size_bytes() - 1);
- kmem_end = kmem_start + kmem_size;
-
- new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
- if (!new_res_name)
+ res_name = kstrdup(dev_name(dev), GFP_KERNEL);
+ if (!res_name)
return -ENOMEM;
- /* Region is permanently reserved if hotremove fails. */
- new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
- if (!new_res) {
- dev_warn(dev, "could not reserve region [%pa-%pa]\n",
- &kmem_start, &kmem_end);
- kfree(new_res_name);
- return -EBUSY;
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct resource *res;
+ struct range range;
+ int rc;
+
+ rc = dax_kmem_range(dev_dax, i, &range);
+ if (rc) {
+ dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
+ i, range.start, range.end);
+ continue;
+ }
+
+ /* Region is permanently reserved if hotremove fails. */
+ res = request_mem_region(range.start, range_len(&range), res_name);
+ if (!res) {
+ dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
+ i, range.start, range.end);
+ /*
+ * Once some memory has been onlined we can't
+ * assume that it can be un-onlined safely.
+ */
+ if (mapped)
+ continue;
+ kfree(res_name);
+ return -EBUSY;
+ }
+
+ /*
+ * Set flags appropriate for System RAM. Leave ..._BUSY clear
+ * so that add_memory() can add a child resource. Do not
+ * inherit flags from the parent since it may set new flags
+ * unknown to us that will break add_memory() below.
+ */
+ res->flags = IORESOURCE_SYSTEM_RAM;
+
+ /*
+ * Ensure that future kexec'd kernels will not treat
+ * this as RAM automatically.
+ */
+ rc = add_memory_driver_managed(numa_node, range.start,
+ range_len(&range), kmem_name);
+
+ if (rc) {
+ dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
+ i, range.start, range.end);
+ release_mem_region(range.start, range_len(&range));
+ if (mapped)
+ continue;
+ kfree(res_name);
+ return rc;
+ }
+ mapped++;
}
- /*
- * Set flags appropriate for System RAM. Leave ..._BUSY clear
- * so that add_memory() can add a child resource. Do not
- * inherit flags from the parent since it may set new flags
- * unknown to us that will break add_memory() below.
- */
- new_res->flags = IORESOURCE_SYSTEM_RAM;
-
- /*
- * Ensure that future kexec'd kernels will not treat this as RAM
- * automatically.
- */
- rc = add_memory_driver_managed(numa_node, new_res->start,
- resource_size(new_res), kmem_name);
- if (rc) {
- release_resource(new_res);
- kfree(new_res);
- kfree(new_res_name);
- return rc;
- }
- dev_dax->dax_kmem_res = new_res;
+ dev_set_drvdata(dev, res_name);
return 0;
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-static int dev_dax_kmem_remove(struct device *dev)
+static int dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
- struct dev_dax *dev_dax = to_dev_dax(dev);
- struct resource *res = dev_dax->dax_kmem_res;
- resource_size_t kmem_start = res->start;
- resource_size_t kmem_size = resource_size(res);
- const char *res_name = res->name;
- int rc;
+ int i, success = 0;
+ struct device *dev = &dev_dax->dev;
+ const char *res_name = dev_get_drvdata(dev);
/*
* We have one shot for removing memory, if some memory blocks were not
@@ -108,25 +131,36 @@ static int dev_dax_kmem_remove(struct device *dev)
* there is no way to hotremove this memory until reboot because device
* unbind will succeed even if we return failure.
*/
- rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
- if (rc) {
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range range;
+ int rc;
+
+ rc = dax_kmem_range(dev_dax, i, &range);
+ if (rc)
+ continue;
+
+ rc = remove_memory(dev_dax->target_node, range.start,
+ range_len(&range));
+ if (rc == 0) {
+ release_mem_region(range.start, range_len(&range));
+ success++;
+ continue;
+ }
any_hotremove_failed = true;
dev_err(dev,
- "DAX region %pR cannot be hotremoved until the next reboot\n",
- res);
- return rc;
+ "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n",
+ i, range.start, range.end);
}
- /* Release and free dax resources */
- release_resource(res);
- kfree(res);
- kfree(res_name);
- dev_dax->dax_kmem_res = NULL;
+ if (success >= dev_dax->nr_range) {
+ kfree(res_name);
+ dev_set_drvdata(dev, NULL);
+ }
return 0;
}
#else
-static int dev_dax_kmem_remove(struct device *dev)
+static int dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
/*
* Without hotremove purposely leak the request_mem_region() for the
@@ -141,10 +175,8 @@ static int dev_dax_kmem_remove(struct device *dev)
#endif /* CONFIG_MEMORY_HOTREMOVE */
static struct dax_device_driver device_dax_kmem_driver = {
- .drv = {
- .probe = dev_dax_kmem_probe,
- .remove = dev_dax_kmem_remove,
- },
+ .probe = dev_dax_kmem_probe,
+ .remove = dev_dax_kmem_remove,
};
static int __init dax_kmem_init(void)
diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c
index d7b15e6f30c5..863c114fd88c 100644
--- a/drivers/dax/pmem/compat.c
+++ b/drivers/dax/pmem/compat.c
@@ -22,7 +22,7 @@ static int dax_pmem_compat_probe(struct device *dev)
return -ENOMEM;
device_lock(&dev_dax->dev);
- rc = dev_dax_probe(&dev_dax->dev);
+ rc = dev_dax_probe(dev_dax);
device_unlock(&dev_dax->dev);
devres_close_group(&dev_dax->dev, dev_dax);
diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c
index 2bedf8414fff..62b26bfceab1 100644
--- a/drivers/dax/pmem/core.c
+++ b/drivers/dax/pmem/core.c
@@ -9,11 +9,12 @@
struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
{
- struct resource res;
+ struct range range;
int rc, id, region_id;
resource_size_t offset;
struct nd_pfn_sb *pfn_sb;
struct dev_dax *dev_dax;
+ struct dev_dax_data data;
struct nd_namespace_io *nsio;
struct dax_region *dax_region;
struct dev_pagemap pgmap = { };
@@ -49,16 +50,23 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
if (rc != 2)
return ERR_PTR(-EINVAL);
- /* adjust the dax_region resource to the start of data */
- memcpy(&res, &pgmap.res, sizeof(res));
- res.start += offset;
- dax_region = alloc_dax_region(dev, region_id, &res,
+ /* adjust the dax_region range to the start of data */
+ range = pgmap.range;
+ range.start += offset,
+ dax_region = alloc_dax_region(dev, region_id, &range,
nd_region->target_node, le32_to_cpu(pfn_sb->align),
- PFN_DEV|PFN_MAP);
+ IORESOURCE_DAX_STATIC);
if (!dax_region)
return ERR_PTR(-ENOMEM);
- dev_dax = __devm_create_dev_dax(dax_region, id, &pgmap, subsys);
+ data = (struct dev_dax_data) {
+ .dax_region = dax_region,
+ .id = id,
+ .pgmap = &pgmap,
+ .subsys = subsys,
+ .size = range_len(&range),
+ };
+ dev_dax = devm_create_dev_dax(&data);
/* child dev_dax instances now own the lifetime of the dax_region */
dax_region_put(dax_region);
diff --git a/drivers/firmware/efi/x86_fake_mem.c b/drivers/firmware/efi/x86_fake_mem.c
index e5d6d5a1b240..0bafcc1bb0f6 100644
--- a/drivers/firmware/efi/x86_fake_mem.c
+++ b/drivers/firmware/efi/x86_fake_mem.c
@@ -38,7 +38,7 @@ void __init efi_fake_memmap_early(void)
m_start = mem->range.start;
m_end = mem->range.end;
for_each_efi_memory_desc(md) {
- u64 start, end;
+ u64 start, end, size;
if (md->type != EFI_CONVENTIONAL_MEMORY)
continue;
@@ -58,11 +58,17 @@ void __init efi_fake_memmap_early(void)
*/
start = max(start, m_start);
end = min(end, m_end);
+ size = end - start + 1;
if (end <= start)
continue;
- e820__range_update(start, end - start + 1, E820_TYPE_RAM,
- E820_TYPE_SOFT_RESERVED);
+
+ /*
+ * Ensure each efi_fake_mem instance results in
+ * a unique e820 resource
+ */
+ e820__range_remove(start, size, E820_TYPE_RAM, 1);
+ e820__range_add(start, size, E820_TYPE_SOFT_RESERVED);
e820__update_table(e820_table);
}
}
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index 38113d3c0138..75e8b71c18b9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -258,8 +258,8 @@ shmem_writeback(struct drm_i915_gem_object *obj)
for (i = 0; i < obj->base.size >> PAGE_SHIFT; i++) {
struct page *page;
- page = find_lock_entry(mapping, i);
- if (!page || xa_is_value(page))
+ page = find_lock_page(mapping, i);
+ if (!page)
continue;
if (!page_mapped(page) && clear_page_dirty_for_io(page)) {
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 4e8112fde3e6..a13c6215bba8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -101,7 +101,7 @@ unsigned long nouveau_dmem_page_addr(struct page *page)
{
struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page);
unsigned long off = (page_to_pfn(page) << PAGE_SHIFT) -
- chunk->pagemap.res.start;
+ chunk->pagemap.range.start;
return chunk->bo->offset + off;
}
@@ -249,7 +249,9 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage)
chunk->drm = drm;
chunk->pagemap.type = MEMORY_DEVICE_PRIVATE;
- chunk->pagemap.res = *res;
+ chunk->pagemap.range.start = res->start;
+ chunk->pagemap.range.end = res->end;
+ chunk->pagemap.nr_range = 1;
chunk->pagemap.ops = &nouveau_dmem_pagemap_ops;
chunk->pagemap.owner = drm->dev;
@@ -273,7 +275,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage)
list_add(&chunk->list, &drm->dmem->chunks);
mutex_unlock(&drm->dmem->mutex);
- pfn_first = chunk->pagemap.res.start >> PAGE_SHIFT;
+ pfn_first = chunk->pagemap.range.start >> PAGE_SHIFT;
page = pfn_to_page(pfn_first);
spin_lock(&drm->dmem->lock);
for (i = 0; i < DMEM_CHUNK_NPAGES - 1; ++i, ++page) {
@@ -294,8 +296,7 @@ out_bo_unpin:
out_bo_free:
nouveau_bo_ref(NULL, &chunk->bo);
out_release:
- release_mem_region(chunk->pagemap.res.start,
- resource_size(&chunk->pagemap.res));
+ release_mem_region(chunk->pagemap.range.start, range_len(&chunk->pagemap.range));
out_free:
kfree(chunk);
out:
@@ -382,8 +383,8 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
nouveau_bo_ref(NULL, &chunk->bo);
list_del(&chunk->list);
memunmap_pages(&chunk->pagemap);
- release_mem_region(chunk->pagemap.res.start,
- resource_size(&chunk->pagemap.res));
+ release_mem_region(chunk->pagemap.range.start,
+ range_len(&chunk->pagemap.range));
kfree(chunk);
}
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 0418071a3724..46d885575601 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2198,7 +2198,7 @@ static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
addr_end = addr + size - 1;
- for_each_reserved_mem_region(i, &start, &end) {
+ for_each_reserved_mem_range(i, &start, &end) {
if (addr >= start && addr_end <= end)
return true;
}
diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
index b9eeefa27e3a..aaf6e215a8c6 100644
--- a/drivers/nvdimm/badrange.c
+++ b/drivers/nvdimm/badrange.c
@@ -211,7 +211,7 @@ static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
}
static void badblocks_populate(struct badrange *badrange,
- struct badblocks *bb, const struct resource *res)
+ struct badblocks *bb, const struct range *range)
{
struct badrange_entry *bre;
@@ -222,34 +222,34 @@ static void badblocks_populate(struct badrange *badrange,
u64 bre_end = bre->start + bre->length - 1;
/* Discard intervals with no intersection */
- if (bre_end < res->start)
+ if (bre_end < range->start)
continue;
- if (bre->start > res->end)
+ if (bre->start > range->end)
continue;
/* Deal with any overlap after start of the namespace */
- if (bre->start >= res->start) {
+ if (bre->start >= range->start) {
u64 start = bre->start;
u64 len;
- if (bre_end <= res->end)
+ if (bre_end <= range->end)
len = bre->length;
else
- len = res->start + resource_size(res)
+ len = range->start + range_len(range)
- bre->start;
- __add_badblock_range(bb, start - res->start, len);
+ __add_badblock_range(bb, start - range->start, len);
continue;
}
/*
* Deal with overlap for badrange starting before
* the namespace.
*/
- if (bre->start < res->start) {
+ if (bre->start < range->start) {
u64 len;
- if (bre_end < res->end)
- len = bre->start + bre->length - res->start;
+ if (bre_end < range->end)
+ len = bre->start + bre->length - range->start;
else
- len = resource_size(res);
+ len = range_len(range);
__add_badblock_range(bb, 0, len);
}
}
@@ -267,7 +267,7 @@ static void badblocks_populate(struct badrange *badrange,
* and add badblocks entries for all matching sub-ranges
*/
void nvdimm_badblocks_populate(struct nd_region *nd_region,
- struct badblocks *bb, const struct resource *res)
+ struct badblocks *bb, const struct range *range)
{
struct nvdimm_bus *nvdimm_bus;
@@ -279,7 +279,7 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region,
nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
nvdimm_bus_lock(&nvdimm_bus->dev);
- badblocks_populate(&nvdimm_bus->badrange, bb, res);
+ badblocks_populate(&nvdimm_bus->badrange, bb, range);
nvdimm_bus_unlock(&nvdimm_bus->dev);
}
EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 22d865ba6353..5a7c80053c62 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -303,13 +303,16 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio,
resource_size_t size)
{
- struct resource *res = &nsio->res;
struct nd_namespace_common *ndns = &nsio->common;
+ struct range range = {
+ .start = nsio->res.start,
+ .end = nsio->res.end,
+ };
nsio->size = size;
- if (!devm_request_mem_region(dev, res->start, size,
+ if (!devm_request_mem_region(dev, range.start, size,
dev_name(&ndns->dev))) {
- dev_warn(dev, "could not reserve region %pR\n", res);
+ dev_warn(dev, "could not reserve region %pR\n", &nsio->res);
return -EBUSY;
}
@@ -317,9 +320,9 @@ int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio,
if (devm_init_badblocks(dev, &nsio->bb))
return -ENOMEM;
nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb,
- &nsio->res);
+ &range);
- nsio->addr = devm_memremap(dev, res->start, size, ARCH_MEMREMAP_PMEM);
+ nsio->addr = devm_memremap(dev, range.start, size, ARCH_MEMREMAP_PMEM);
return PTR_ERR_OR_ZERO(nsio->addr);
}
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 72740108ba42..696b55556d4d 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -377,8 +377,9 @@ int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt);
const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
char *name);
unsigned int pmem_sector_size(struct nd_namespace_common *ndns);
+struct range;
void nvdimm_badblocks_populate(struct nd_region *nd_region,
- struct badblocks *bb, const struct resource *res);
+ struct badblocks *bb, const struct range *range);
int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns,
resource_size_t size);
void devm_namespace_disable(struct device *dev,
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 3e11ef8d3f5b..b499df630d4d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -672,7 +672,7 @@ static unsigned long init_altmap_reserve(resource_size_t base)
static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
{
- struct resource *res = &pgmap->res;
+ struct range *range = &pgmap->range;
struct vmem_altmap *altmap = &pgmap->altmap;
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
u64 offset = le64_to_cpu(pfn_sb->dataoff);
@@ -689,16 +689,17 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
.end_pfn = PHYS_PFN(end),
};
- memcpy(res, &nsio->res, sizeof(*res));
- res->start += start_pad;
- res->end -= end_trunc;
-
+ *range = (struct range) {
+ .start = nsio->res.start + start_pad,
+ .end = nsio->res.end - end_trunc,
+ };
+ pgmap->nr_range = 1;
if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < reserve)
return -EINVAL;
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
} else if (nd_pfn->mode == PFN_MODE_PMEM) {
- nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset));
+ nd_pfn->npfns = PHYS_PFN((range_len(range) - offset));
if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
dev_info(&nd_pfn->dev,
"number of pfns truncated from %lld to %ld\n",
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c86a0ceaece6..875076b0ea6c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -375,7 +375,7 @@ static int pmem_attach_disk(struct device *dev,
struct nd_region *nd_region = to_nd_region(dev->parent);
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
- struct resource bb_res;
+ struct range bb_range;
struct nd_pfn *nd_pfn = NULL;
struct dax_device *dax_dev;
struct nd_pfn_sb *pfn_sb;
@@ -434,24 +434,27 @@ static int pmem_attach_disk(struct device *dev,
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
pmem->pfn_pad = resource_size(res) -
- resource_size(&pmem->pgmap.res);
+ range_len(&pmem->pgmap.range);
pmem->pfn_flags |= PFN_MAP;
- memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
- bb_res.start += pmem->data_offset;
+ bb_range = pmem->pgmap.range;
+ bb_range.start += pmem->data_offset;
} else if (pmem_should_map_pages(dev)) {
- memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
+ pmem->pgmap.range.start = res->start;
+ pmem->pgmap.range.end = res->end;
+ pmem->pgmap.nr_range = 1;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
- memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
+ bb_range = pmem->pgmap.range;
} else {
if (devm_add_action_or_reset(dev, pmem_release_queue,
&pmem->pgmap))
return -ENOMEM;
addr = devm_memremap(dev, pmem->phys_addr,
pmem->size, ARCH_MEMREMAP_PMEM);
- memcpy(&bb_res, &nsio->res, sizeof(bb_res));
+ bb_range.start = res->start;
+ bb_range.end = res->end;
}
if (IS_ERR(addr))
@@ -480,7 +483,7 @@ static int pmem_attach_disk(struct device *dev,
/ 512);
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
- nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
+ nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range);
disk->bb = &pmem->bb;
if (is_nvdimm_sync(nd_region))
@@ -591,8 +594,8 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
resource_size_t offset = 0, end_trunc = 0;
struct nd_namespace_common *ndns;
struct nd_namespace_io *nsio;
- struct resource res;
struct badblocks *bb;
+ struct range range;
struct kernfs_node *bb_state;
if (event != NVDIMM_REVALIDATE_POISON)
@@ -628,9 +631,9 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
nsio = to_nd_namespace_io(&ndns->dev);
}
- res.start = nsio->res.start + offset;
- res.end = nsio->res.end - end_trunc;
- nvdimm_badblocks_populate(nd_region, bb, &res);
+ range.start = nsio->res.start + offset;
+ range.end = nsio->res.end - end_trunc;
+ nvdimm_badblocks_populate(nd_region, bb, &range);
if (bb_state)
sysfs_notify_dirent(bb_state);
}
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 0f6978e72e7c..bfce87ed72ab 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -35,7 +35,10 @@ static int nd_region_probe(struct device *dev)
return rc;
if (is_memory(&nd_region->dev)) {
- struct resource ndr_res;
+ struct range range = {
+ .start = nd_region->ndr_start,
+ .end = nd_region->ndr_start + nd_region->ndr_size - 1,
+ };
if (devm_init_badblocks(dev, &nd_region->bb))
return -ENODEV;
@@ -44,9 +47,7 @@ static int nd_region_probe(struct device *dev)
if (!nd_region->bb_state)
dev_warn(&nd_region->dev,
"'badblocks' notification disabled\n");
- ndr_res.start = nd_region->ndr_start;
- ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1;
- nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res);
+ nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range);
}
rc = nd_region_register_namespaces(nd_region, &err);
@@ -121,14 +122,16 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event)
{
if (event == NVDIMM_REVALIDATE_POISON) {
struct nd_region *nd_region = to_nd_region(dev);
- struct resource res;
if (is_memory(&nd_region->dev)) {
- res.start = nd_region->ndr_start;
- res.end = nd_region->ndr_start +
- nd_region->ndr_size - 1;
+ struct range range = {
+ .start = nd_region->ndr_start,
+ .end = nd_region->ndr_start +
+ nd_region->ndr_size - 1,
+ };
+
nvdimm_badblocks_populate(nd_region,
- &nd_region->bb, &res);
+ &nd_region->bb, &range);
if (nd_region->bb_state)
sysfs_notify_dirent(nd_region->bb_state);
}
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index f357f9a32b3a..9d53c16b7329 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -185,9 +185,9 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
return -ENOMEM;
pgmap = &p2p_pgmap->pgmap;
- pgmap->res.start = pci_resource_start(pdev, bar) + offset;
- pgmap->res.end = pgmap->res.start + size - 1;
- pgmap->res.flags = pci_resource_flags(pdev, bar);
+ pgmap->range.start = pci_resource_start(pdev, bar) + offset;
+ pgmap->range.end = pgmap->range.start + size - 1;
+ pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
p2p_pgmap->provider = pdev;
@@ -202,13 +202,13 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
pci_bus_address(pdev, bar) + offset,
- resource_size(&pgmap->res), dev_to_node(&pdev->dev),
+ range_len(&pgmap->range), dev_to_node(&pdev->dev),
pgmap->ref);
if (error)
goto pages_free;
- pci_info(pdev, "added peer-to-peer DMA memory %pR\n",
- &pgmap->res);
+ pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n",
+ pgmap->range.start, pgmap->range.end);
return 0;
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index c08512fcea90..834b7c13ef3d 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -36,18 +36,10 @@ enum virtio_mem_mb_state {
VIRTIO_MEM_MB_STATE_OFFLINE,
/* Partially plugged, fully added to Linux, offline. */
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
- /* Fully plugged, fully added to Linux, online (!ZONE_MOVABLE). */
+ /* Fully plugged, fully added to Linux, online. */
VIRTIO_MEM_MB_STATE_ONLINE,
- /* Partially plugged, fully added to Linux, online (!ZONE_MOVABLE). */
+ /* Partially plugged, fully added to Linux, online. */
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
- /*
- * Fully plugged, fully added to Linux, online (ZONE_MOVABLE).
- * We are not allowed to allocate (unplug) parts of this block that
- * are not movable (similar to gigantic pages). We will never allow
- * to online OFFLINE_PARTIAL to ZONE_MOVABLE (as they would contain
- * unmovable parts).
- */
- VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE,
VIRTIO_MEM_MB_STATE_COUNT
};
@@ -526,21 +518,10 @@ static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
}
static int virtio_mem_notify_going_online(struct virtio_mem *vm,
- unsigned long mb_id,
- enum zone_type zone)
+ unsigned long mb_id)
{
switch (virtio_mem_mb_get_state(vm, mb_id)) {
case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
- /*
- * We won't allow to online a partially plugged memory block
- * to the MOVABLE zone - it would contain unmovable parts.
- */
- if (zone == ZONE_MOVABLE) {
- dev_warn_ratelimited(&vm->vdev->dev,
- "memory block has holes, MOVABLE not supported\n");
- return NOTIFY_BAD;
- }
- return NOTIFY_OK;
case VIRTIO_MEM_MB_STATE_OFFLINE:
return NOTIFY_OK;
default:
@@ -560,7 +541,6 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm,
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
break;
case VIRTIO_MEM_MB_STATE_ONLINE:
- case VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE:
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE);
break;
@@ -579,24 +559,17 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm,
virtio_mem_retry(vm);
}
-static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id,
- enum zone_type zone)
+static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id)
{
unsigned long nb_offline;
switch (virtio_mem_mb_get_state(vm, mb_id)) {
case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
- BUG_ON(zone == ZONE_MOVABLE);
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
break;
case VIRTIO_MEM_MB_STATE_OFFLINE:
- if (zone == ZONE_MOVABLE)
- virtio_mem_mb_set_state(vm, mb_id,
- VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE);
- else
- virtio_mem_mb_set_state(vm, mb_id,
- VIRTIO_MEM_MB_STATE_ONLINE);
+ virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE);
break;
default:
BUG();
@@ -675,7 +648,6 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
const unsigned long start = PFN_PHYS(mhp->start_pfn);
const unsigned long size = PFN_PHYS(mhp->nr_pages);
const unsigned long mb_id = virtio_mem_phys_to_mb_id(start);
- enum zone_type zone;
int rc = NOTIFY_OK;
if (!virtio_mem_overlaps_range(vm, start, size))
@@ -717,8 +689,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
break;
}
vm->hotplug_active = true;
- zone = page_zonenum(pfn_to_page(mhp->start_pfn));
- rc = virtio_mem_notify_going_online(vm, mb_id, zone);
+ rc = virtio_mem_notify_going_online(vm, mb_id);
break;
case MEM_OFFLINE:
virtio_mem_notify_offline(vm, mb_id);
@@ -726,8 +697,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
mutex_unlock(&vm->hotplug_mutex);
break;
case MEM_ONLINE:
- zone = page_zonenum(pfn_to_page(mhp->start_pfn));
- virtio_mem_notify_online(vm, mb_id, zone);
+ virtio_mem_notify_online(vm, mb_id);
vm->hotplug_active = false;
mutex_unlock(&vm->hotplug_mutex);
break;
@@ -1906,8 +1876,7 @@ static void virtio_mem_remove(struct virtio_device *vdev)
if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
- vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL] ||
- vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE]) {
+ vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) {
dev_warn(&vdev->dev, "device still has system memory added\n");
} else {
virtio_mem_delete_resource(vm);
diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c
index 3b98dc921426..8c512ea550bb 100644
--- a/drivers/xen/unpopulated-alloc.c
+++ b/drivers/xen/unpopulated-alloc.c
@@ -18,27 +18,38 @@ static unsigned int list_count;
static int fill_list(unsigned int nr_pages)
{
struct dev_pagemap *pgmap;
+ struct resource *res;
void *vaddr;
unsigned int i, alloc_pages = round_up(nr_pages, PAGES_PER_SECTION);
- int ret;
+ int ret = -ENOMEM;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
if (!pgmap)
- return -ENOMEM;
+ goto err_pgmap;
pgmap->type = MEMORY_DEVICE_GENERIC;
- pgmap->res.name = "Xen scratch";
- pgmap->res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res->name = "Xen scratch";
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- ret = allocate_resource(&iomem_resource, &pgmap->res,
+ ret = allocate_resource(&iomem_resource, res,
alloc_pages * PAGE_SIZE, 0, -1,
PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL);
if (ret < 0) {
pr_err("Cannot allocate new IOMEM resource\n");
- kfree(pgmap);
- return ret;
+ goto err_resource;
}
+ pgmap->range = (struct range) {
+ .start = res->start,
+ .end = res->end,
+ };
+ pgmap->nr_range = 1;
+ pgmap->owner = res;
+
#ifdef CONFIG_XEN_HAVE_PVMMU
/*
* memremap will build page tables for the new memory so
@@ -50,14 +61,13 @@ static int fill_list(unsigned int nr_pages)
* conflict with any devices.
*/
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- xen_pfn_t pfn = PFN_DOWN(pgmap->res.start);
+ xen_pfn_t pfn = PFN_DOWN(res->start);
for (i = 0; i < alloc_pages; i++) {
if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) {
pr_warn("set_phys_to_machine() failed, no memory added\n");
- release_resource(&pgmap->res);
- kfree(pgmap);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err_memremap;
}
}
}
@@ -66,9 +76,8 @@ static int fill_list(unsigned int nr_pages)
vaddr = memremap_pages(pgmap, NUMA_NO_NODE);
if (IS_ERR(vaddr)) {
pr_err("Cannot remap memory range\n");
- release_resource(&pgmap->res);
- kfree(pgmap);
- return PTR_ERR(vaddr);
+ ret = PTR_ERR(vaddr);
+ goto err_memremap;
}
for (i = 0; i < alloc_pages; i++) {
@@ -80,6 +89,14 @@ static int fill_list(unsigned int nr_pages)
}
return 0;
+
+err_memremap:
+ release_resource(res);
+err_resource:
+ kfree(pgmap);
+err_pgmap:
+ kfree(res);
+ return ret;
}
/**
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index ab53e42a874a..68b0148f4bb8 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -189,7 +189,7 @@ out:
}
EXPORT_SYMBOL(fs_lookup_param);
-int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
+static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
{
return inval_plog(log, "Bad value for '%s'", param->key);
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 9bb9f0952b18..caf563981532 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1810,6 +1810,12 @@ int ntfs_read_inode_mount(struct inode *vi)
brelse(bh);
}
+ if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
+ ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
+ le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
+ goto err_out;
+ }
+
/* Apply the mst fixups. */
if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
/* FIXME: Try to use the $MFTMirr now. */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4c1b90442d6f..78710788c237 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6013,7 +6013,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
- /* Appending truncate log(TA) and and flushing truncate log(TF) are
+ /* Appending truncate log(TA) and flushing truncate log(TF) are
* two separated transactions. They can be both committed but not
* checkpointed. If crash occurs then, both two transaction will be
* replayed with several already released to global bitmap clusters.
@@ -7654,8 +7654,10 @@ out_mutex:
* main_bm related locks for avoiding the current IO starve, then go to
* trim the next group
*/
- if (ret >= 0 && group <= last_group)
+ if (ret >= 0 && group <= last_group) {
+ cond_resched();
goto next_group;
+ }
out:
range->len = trimmed * sb->s_blocksize;
return ret;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 720e9f94957e..fc8252a28cb1 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -677,7 +677,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
/*
* Under certain conditions, the window slide code
* might have reduced the number of bits available or
- * disabled the the local alloc entirely. Re-check
+ * disabled the local alloc entirely. Re-check
* here and return -ENOSPC if necessary.
*/
status = -ENOSPC;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 617db4e0faa0..aa69c35d904c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1055,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
- static DEFINE_MUTEX(oom_adj_mutex);
struct mm_struct *mm = NULL;
struct task_struct *task;
int err = 0;
@@ -1095,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
struct task_struct *p = find_lock_task_mm(task);
if (p) {
- if (atomic_read(&p->mm->mm_users) > 1) {
+ if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
mm = p->mm;
mmgrab(mm);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 35172a91148e..846d43df3fdf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -520,16 +520,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
- page = find_get_entry(vma->vm_file->f_mapping,
+ page = xa_load(&vma->vm_file->f_mapping->i_pages,
linear_page_index(vma, addr));
- if (!page)
- return;
-
if (xa_is_value(page))
mss->swap += PAGE_SIZE;
- else
- put_page(page);
-
return;
}
@@ -727,9 +721,21 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pte_hole = smaps_pte_hole,
};
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss)
+ struct mem_size_stats *mss, unsigned long start)
{
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;
+
#ifdef CONFIG_SHMEM
/* In case of smaps_rollup, reset the value from previous vma */
mss->check_shmem_swap = false;
@@ -746,18 +752,20 @@ static void smap_gather_stats(struct vm_area_struct *vma,
*/
unsigned long shmem_swapped = shmem_swap_usage(vma);
- if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE)) {
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
- walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
- return;
+ ops = &smaps_shmem_walk_ops;
}
}
#endif
/* mmap_lock is held in m_start */
- walk_page_vma(vma, &smaps_walk_ops, mss);
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}
#define SEQ_PUT_DEC(str, val) \
@@ -809,7 +817,7 @@ static int show_smap(struct seq_file *m, void *v)
memset(&mss, 0, sizeof(mss));
- smap_gather_stats(vma, &mss);
+ smap_gather_stats(vma, &mss, 0);
show_map_vma(m, vma);
@@ -857,9 +865,73 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
hold_task_mempolicy(priv);
- for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
- smap_gather_stats(vma, &mss);
+ for (vma = priv->mm->mmap; vma;) {
+ smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
+
+ /*
+ * Release mmap_lock temporarily if someone wants to
+ * access it for write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA2.
+ * Iterate the loop like the original one.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * find_vma(mm, 16k - 1) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * find_vma(mm, 16k - 1) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = find_vma(mm, last_vma_end - 1);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 above */
+ if (vma->vm_start >= last_vma_end)
+ continue;
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end)
+ smap_gather_stats(vma, &mss, last_vma_end);
+ }
+ /* Case 2 above */
+ vma = vma->vm_next;
}
show_vma_header_prefix(m, priv->mm->mmap->vm_start,
diff --git a/fs/xattr.c b/fs/xattr.c
index 386b45676d7e..cd7a563e8bcd 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -232,15 +232,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
}
/**
- * __vfs_setxattr_locked: set an extended attribute while holding the inode
+ * __vfs_setxattr_locked - set an extended attribute while holding the inode
* lock
*
- * @dentry - object to perform setxattr on
- * @name - xattr name to set
- * @value - value to set @name to
- * @size - size of @value
- * @flags - flags to pass into filesystem operations
- * @delegated_inode - on return, will contain an inode pointer that
+ * @dentry: object to perform setxattr on
+ * @name: xattr name to set
+ * @value: value to set @name to
+ * @size: size of @value
+ * @flags: flags to pass into filesystem operations
+ * @delegated_inode: on return, will contain an inode pointer that
* a delegation was broken on, NULL if none.
*/
int
@@ -443,12 +443,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
EXPORT_SYMBOL(__vfs_removexattr);
/**
- * __vfs_removexattr_locked: set an extended attribute while holding the inode
+ * __vfs_removexattr_locked - set an extended attribute while holding the inode
* lock
*
- * @dentry - object to perform setxattr on
- * @name - name of xattr to remove
- * @delegated_inode - on return, will contain an inode pointer that
+ * @dentry: object to perform setxattr on
+ * @name: name of xattr to remove
+ * @delegated_inode: on return, will contain an inode pointer that
* a delegation was broken on, NULL if none.
*/
int
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
index fdebcfc6c8df..0e9302285f14 100644
--- a/include/acpi/acpi_numa.h
+++ b/include/acpi/acpi_numa.h
@@ -17,10 +17,22 @@ extern int pxm_to_node(int);
extern int node_to_pxm(int);
extern int acpi_map_pxm_to_node(int);
extern unsigned char acpi_srat_revision;
-extern int acpi_numa __initdata;
+extern void disable_srat(void);
extern void bad_srat(void);
extern int srat_disabled(void);
+#else /* CONFIG_ACPI_NUMA */
+static inline void disable_srat(void)
+{
+}
#endif /* CONFIG_ACPI_NUMA */
+
+#ifdef CONFIG_ACPI_HMAT
+extern void disable_hmat(void);
+#else /* CONFIG_ACPI_HMAT */
+static inline void disable_hmat(void)
+{
+}
+#endif /* CONFIG_ACPI_HMAT */
#endif /* __ACP_NUMA_H */
diff --git a/include/kunit/test.h b/include/kunit/test.h
index 59f3144f009a..3391f38389f8 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -224,6 +224,11 @@ struct kunit {
struct list_head resources; /* Protected by lock. */
};
+static inline void kunit_set_failure(struct kunit *test)
+{
+ WRITE_ONCE(test->success, false);
+}
+
void kunit_init_test(struct kunit *test, const char *name, char *log);
int kunit_run_tests(struct kunit_suite *suite);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 64ae25c59d55..cfa8c0015863 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -709,6 +709,8 @@ static inline u64 acpi_arch_get_root_pointer(void)
#define ACPI_HANDLE_FWNODE(fwnode) (NULL)
#define ACPI_DEVICE_CLASS(_cls, _msk) .cls = (0), .cls_msk = (0),
+#include <acpi/acpi_numa.h>
+
struct fwnode_handle;
static inline bool acpi_dev_found(const char *hid)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 25a521d299c1..1de5a1151ee7 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -29,9 +29,6 @@ enum compact_result {
/* compaction didn't start as it was deferred due to past failures */
COMPACT_DEFERRED,
- /* compaction not active last round */
- COMPACT_INACTIVE = COMPACT_DEFERRED,
-
/* For more detailed tracepoint output - internal to compaction */
COMPACT_NO_SUITABLE_PAGE,
/* compaction should continue to another pageblock */
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index cee0c728d39a..230604e7f057 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -3,6 +3,14 @@
#error "Please don't include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead."
#endif
+#define CLANG_VERSION (__clang_major__ * 10000 \
+ + __clang_minor__ * 100 \
+ + __clang_patchlevel__)
+
+#if CLANG_VERSION < 100001
+# error Sorry, your version of Clang is too old - please use 10.0.1 or newer.
+#endif
+
/* Compiler specific definitions for Clang compiler */
/* same as gcc, this was present in clang-2.6 so we can assume it works
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 7a3769040d7d..d1e3c6896b71 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -12,7 +12,7 @@
/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */
#if GCC_VERSION < 40900
-# error Sorry, your compiler is too old - please upgrade it.
+# error Sorry, your version of GCC is too old - please use 4.9 or newer.
#endif
/* Optimization barrier */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 92ef163a7479..ac45f6d40d39 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -155,7 +155,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
extern typeof(sym) sym; \
static const unsigned long __kentry_##sym \
__used \
- __section("___kentry" "+" #sym ) \
+ __attribute__((__section__("___kentry+" #sym))) \
= (unsigned long)&sym;
#endif
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 43b39ab9de1a..4ec0bbf86205 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -238,4 +238,12 @@ static inline bool dax_mapping(struct address_space *mapping)
return mapping->host && IS_DAX(mapping->host);
}
+#ifdef CONFIG_DEV_DAX_HMEM_DEVICES
+void hmem_register_device(int target_nid, struct resource *r);
+#else
+static inline void hmem_register_device(int target_nid, struct resource *r)
+{
+}
+#endif
+
#endif
diff --git a/include/linux/export.h b/include/linux/export.h
index fceb5e855717..8933ff6ad23a 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -130,7 +130,7 @@ struct kernel_symbol {
* discarded in the final link stage.
*/
#define __ksym_marker(sym) \
- static int __ksym_marker_##sym[0] __section(".discard.ksym") __used
+ static int __ksym_marker_##sym[0] __section(.discard.ksym) __used
#define __EXPORT_SYMBOL(sym, sec, ns) \
__ksym_marker(sym); \
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2e621d28cd65..5815f7d4dbf4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2581,6 +2581,10 @@ extern bool is_bad_inode(struct inode *);
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end);
+void invalidate_mapping_pagevec(struct address_space *mapping,
+ pgoff_t start, pgoff_t end,
+ unsigned long *nr_pagevec);
+
static inline void invalidate_remote_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 67a0774e080b..07e481993ef5 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -238,7 +238,9 @@ struct vm_area_struct;
* %__GFP_FOO flags as necessary.
*
* %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
- * watermark is applied to allow access to "atomic reserves"
+ * watermark is applied to allow access to "atomic reserves".
+ * The current implementation doesn't support NMI and few other strict
+ * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
*
* %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
* %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
@@ -560,8 +562,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
-#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
- alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8a8bc46a2432..0365aa97f8e7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -38,9 +38,6 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
extern int zap_huge_pud(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pud_t *pud, unsigned long addr);
-extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec);
extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr,
pmd_t *old_pmd, pmd_t *new_pmd);
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 087fba34b209..30d343b4a40a 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -14,6 +14,12 @@ struct task_struct;
#include <linux/pgtable.h>
#include <asm/kasan.h>
+/* kasan_data struct is used in KUnit tests for KASAN expected failures */
+struct kunit_kasan_expectation {
+ bool report_expected;
+ bool report_found;
+};
+
extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE];
extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9d925db0d355..ef131255cedc 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -86,7 +86,6 @@ struct memblock {
};
extern struct memblock memblock;
-extern int memblock_debug;
#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
#define __init_memblock __meminit
@@ -98,9 +97,6 @@ void memblock_discard(void);
static inline void memblock_discard(void) {}
#endif
-#define memblock_dbg(fmt, ...) \
- if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
-
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
void memblock_allow_resize(void);
@@ -136,9 +132,6 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
-void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
- phys_addr_t *out_end);
-
void __memblock_free_late(phys_addr_t base, phys_addr_t size);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -166,7 +159,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
#endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */
/**
- * for_each_mem_range - iterate through memblock areas from type_a and not
+ * __for_each_mem_range - iterate through memblock areas from type_a and not
* included in type_b. Or just type_a if type_b is NULL.
* @i: u64 used as loop variable
* @type_a: ptr to memblock_type to iterate
@@ -177,7 +170,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
-#define for_each_mem_range(i, type_a, type_b, nid, flags, \
+#define __for_each_mem_range(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid); \
@@ -186,7 +179,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
p_start, p_end, p_nid))
/**
- * for_each_mem_range_rev - reverse iterate through memblock areas from
+ * __for_each_mem_range_rev - reverse iterate through memblock areas from
* type_a and not included in type_b. Or just type_a if type_b is NULL.
* @i: u64 used as loop variable
* @type_a: ptr to memblock_type to iterate
@@ -197,17 +190,38 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
-#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
- p_start, p_end, p_nid) \
+#define __for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
+ p_start, p_end, p_nid) \
for (i = (u64)ULLONG_MAX, \
- __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
+ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range_rev(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
/**
- * for_each_reserved_mem_region - iterate over all reserved memblock areas
+ * for_each_mem_range - iterate through memory areas.
+ * @i: u64 used as loop variable
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ */
+#define for_each_mem_range(i, p_start, p_end) \
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \
+ MEMBLOCK_NONE, p_start, p_end, NULL)
+
+/**
+ * for_each_mem_range_rev - reverse iterate through memblock areas from
+ * type_a and not included in type_b. Or just type_a if type_b is NULL.
+ * @i: u64 used as loop variable
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ */
+#define for_each_mem_range_rev(i, p_start, p_end) \
+ __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \
+ MEMBLOCK_NONE, p_start, p_end, NULL)
+
+/**
+ * for_each_reserved_mem_range - iterate over all reserved memblock areas
* @i: u64 used as loop variable
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
@@ -215,10 +229,9 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
* Walks over reserved areas of memblock. Available as soon as memblock
* is initialized.
*/
-#define for_each_reserved_mem_region(i, p_start, p_end) \
- for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \
- i != (u64)ULLONG_MAX; \
- __next_reserved_mem_region(&i, p_start, p_end))
+#define for_each_reserved_mem_range(i, p_start, p_end) \
+ __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE, \
+ MEMBLOCK_NONE, p_start, p_end, NULL)
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
@@ -311,8 +324,8 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
* soon as memblock is initialized.
*/
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
- for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
- nid, flags, p_start, p_end, p_nid)
+ __for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
+ nid, flags, p_start, p_end, p_nid)
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -328,8 +341,8 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
*/
#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
p_nid) \
- for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
- nid, flags, p_start, p_end, p_nid)
+ __for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
+ nid, flags, p_start, p_end, p_nid)
int memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid);
@@ -464,7 +477,6 @@ static inline bool memblock_bottom_up(void)
phys_addr_t memblock_phys_mem_size(void);
phys_addr_t memblock_reserved_size(void);
-phys_addr_t memblock_mem_size(unsigned long limit_pfn);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
@@ -476,13 +488,7 @@ bool memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
bool memblock_is_reserved(phys_addr_t addr);
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
-extern void __memblock_dump_all(void);
-
-static inline void memblock_dump_all(void)
-{
- if (memblock_debug)
- __memblock_dump_all();
-}
+void memblock_dump_all(void);
/**
* memblock_set_current_limit - Set the current allocation limit to allow
@@ -547,15 +553,23 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
return PFN_UP(reg->base + reg->size);
}
-#define for_each_memblock(memblock_type, region) \
- for (region = memblock.memblock_type.regions; \
- region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
+/**
+ * for_each_mem_region - itereate over memory regions
+ * @region: loop variable
+ */
+#define for_each_mem_region(region) \
+ for (region = memblock.memory.regions; \
+ region < (memblock.memory.regions + memblock.memory.cnt); \
region++)
-#define for_each_memblock_type(i, memblock_type, rgn) \
- for (i = 0, rgn = &memblock_type->regions[0]; \
- i < memblock_type->cnt; \
- i++, rgn = &memblock_type->regions[i])
+/**
+ * for_each_reserved_mem_region - itereate over reserved memory regions
+ * @region: loop variable
+ */
+#define for_each_reserved_mem_region(region) \
+ for (region = memblock.reserved.regions; \
+ region < (memblock.reserved.regions + memblock.reserved.cnt); \
+ region++)
extern void *alloc_large_system_hash(const char *tablename,
unsigned long bucketsize,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d0b036123c6a..6ef4a552e09d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -215,13 +215,16 @@ struct mem_cgroup {
struct mem_cgroup_id id;
/* Accounted resources */
- struct page_counter memory;
- struct page_counter swap;
+ struct page_counter memory; /* Both v1 & v2 */
+
+ union {
+ struct page_counter swap; /* v2 only */
+ struct page_counter memsw; /* v1 only */
+ };
/* Legacy consumer-oriented counters */
- struct page_counter memsw;
- struct page_counter kmem;
- struct page_counter tcpmem;
+ struct page_counter kmem; /* v1 only */
+ struct page_counter tcpmem; /* v1 only */
/* Range enforcement for interrupt charges */
struct work_struct high_work;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 375515803cd8..c0faa7a30c46 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -149,15 +149,6 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params);
#endif /* ARCH_HAS_ADD_PAGES */
-#ifdef CONFIG_NUMA
-extern int memory_add_physaddr_to_nid(u64 start);
-#else
-static inline int memory_add_physaddr_to_nid(u64 start)
-{
- return 0;
-}
-#endif
-
#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
/*
* For supporting node-hotadd, we have to allocate a new pgdat.
@@ -284,6 +275,20 @@ static inline bool movable_node_is_enabled(void)
}
#endif /* ! CONFIG_MEMORY_HOTPLUG */
+#ifdef CONFIG_NUMA
+extern int memory_add_physaddr_to_nid(u64 start);
+extern int phys_to_target_node(u64 start);
+#else
+static inline int memory_add_physaddr_to_nid(u64 start)
+{
+ return 0;
+}
+static inline int phys_to_target_node(u64 start)
+{
+ return 0;
+}
+#endif
+
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
/*
* pgdat resizing functions
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index e5862746751b..79c49e7f5c30 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_
+#include <linux/range.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>
@@ -93,7 +94,6 @@ struct dev_pagemap_ops {
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
* @altmap: pre-allocated/reserved memory for vmemmap allocations
- * @res: physical address range covered by @ref
* @ref: reference count that pins the devm_memremap_pages() mapping
* @internal_ref: internal reference if @ref is not provided by the caller
* @done: completion for @internal_ref
@@ -103,10 +103,12 @@ struct dev_pagemap_ops {
* @owner: an opaque pointer identifying the entity that manages this
* instance. Used by various helpers to make sure that no
* foreign ZONE_DEVICE memory is accessed.
+ * @nr_range: number of ranges to be mapped
+ * @range: range to be mapped when nr_range == 1
+ * @ranges: array of ranges to be mapped when nr_range > 1
*/
struct dev_pagemap {
struct vmem_altmap altmap;
- struct resource res;
struct percpu_ref *ref;
struct percpu_ref internal_ref;
struct completion done;
@@ -114,6 +116,11 @@ struct dev_pagemap {
unsigned int flags;
const struct dev_pagemap_ops *ops;
void *owner;
+ int nr_range;
+ union {
+ struct range range;
+ struct range ranges[0];
+ };
};
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 13dc9b9ccf8e..620961e4f32b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -791,7 +791,7 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
extern void kvfree(const void *addr);
extern void kvfree_sensitive(const void *addr, size_t len);
-static inline int head_mapcount(struct page *head)
+static inline int head_compound_mapcount(struct page *head)
{
return atomic_read(compound_mapcount_ptr(head)) + 1;
}
@@ -805,7 +805,7 @@ static inline int compound_mapcount(struct page *page)
{
VM_BUG_ON_PAGE(!PageCompound(page), page);
page = compound_head(page);
- return head_mapcount(page);
+ return head_compound_mapcount(page);
}
/*
@@ -918,7 +918,7 @@ static inline bool hpage_pincount_available(struct page *page)
return PageCompound(page) && compound_order(page) > 1;
}
-static inline int head_pincount(struct page *head)
+static inline int head_compound_pincount(struct page *head)
{
return atomic_read(compound_pincount_ptr(head));
}
@@ -927,7 +927,7 @@ static inline int compound_pincount(struct page *page)
{
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
page = compound_head(page);
- return head_pincount(page);
+ return head_compound_pincount(page);
}
static inline void set_compound_order(struct page *page, unsigned int order)
@@ -1653,8 +1653,8 @@ struct mmu_notifier_range;
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma, struct vm_area_struct *new);
+int
+copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
struct mmu_notifier_range *range,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
@@ -2254,7 +2254,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return ptlock_ptr(pmd_to_page(pmd));
}
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pmd_ptlock_init(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
page->pmd_huge_pte = NULL;
@@ -2262,7 +2262,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
return ptlock_init(page);
}
-static inline void pgtable_pmd_page_dtor(struct page *page)
+static inline void pmd_ptlock_free(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
@@ -2279,8 +2279,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}
-static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; }
-static inline void pgtable_pmd_page_dtor(struct page *page) {}
+static inline bool pmd_ptlock_init(struct page *page) { return true; }
+static inline void pmd_ptlock_free(struct page *page) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -2293,6 +2293,22 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}
+static inline bool pgtable_pmd_page_ctor(struct page *page)
+{
+ if (!pmd_ptlock_init(page))
+ return false;
+ __SetPageTable(page);
+ inc_zone_page_state(page, NR_PAGETABLE);
+ return true;
+}
+
+static inline void pgtable_pmd_page_dtor(struct page *page)
+{
+ pmd_ptlock_free(page);
+ __ClearPageTable(page);
+ dec_zone_page_state(page, NR_PAGETABLE);
+}
+
/*
* No scalability reason to split PUD locks yet, but follow the same pattern
* as the PMD locks to make it easier if we decide to. The VM should not be
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 0707671851a8..18e7eae9b5ba 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -87,4 +87,9 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}
+static inline int mmap_lock_is_contended(struct mm_struct *mm)
+{
+ return rwsem_is_contended(&mm->mmap_lock);
+}
+
#endif /* _LINUX_MMAP_LOCK_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0f7a4ff4b059..c27fb1faffe5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -396,6 +396,41 @@ enum zone_type {
*/
ZONE_HIGHMEM,
#endif
+ /*
+ * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
+ * movable pages with few exceptional cases described below. Main use
+ * cases for ZONE_MOVABLE are to make memory offlining/unplug more
+ * likely to succeed, and to locally limit unmovable allocations - e.g.,
+ * to increase the number of THP/huge pages. Notable special cases are:
+ *
+ * 1. Pinned pages: (long-term) pinning of movable pages might
+ * essentially turn such pages unmovable. Memory offlining might
+ * retry a long time.
+ * 2. memblock allocations: kernelcore/movablecore setups might create
+ * situations where ZONE_MOVABLE contains unmovable allocations
+ * after boot. Memory offlining and allocations fail early.
+ * 3. Memory holes: kernelcore/movablecore setups might create very rare
+ * situations where ZONE_MOVABLE contains memory holes after boot,
+ * for example, if we have sections that are only partially
+ * populated. Memory offlining and allocations fail early.
+ * 4. PG_hwpoison pages: while poisoned pages can be skipped during
+ * memory offlining, such pages cannot be allocated.
+ * 5. Unmovable PG_offline pages: in paravirtualized environments,
+ * hotplugged memory blocks might only partially be managed by the
+ * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
+ * parts not manged by the buddy are unmovable PG_offline pages. In
+ * some cases (virtio-mem), such pages can be skipped during
+ * memory offlining, however, cannot be moved/allocated. These
+ * techniques might use alloc_contig_range() to hide previously
+ * exposed pages from the buddy again (e.g., to implement some sort
+ * of memory unplug in virtio-mem).
+ *
+ * In general, no unmovable allocations that degrade memory offlining
+ * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
+ * have to expect that migrating pages in ZONE_MOVABLE can fail (even
+ * if has_unmovable_pages() states that there are no unmovable pages,
+ * there can be false negatives).
+ */
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
@@ -1081,7 +1116,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
z = next_zones_zonelist(++z, highidx, nodemask), \
zone = zonelist_zone(z))
-#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
for (zone = z->zone; \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
diff --git a/include/linux/numa.h b/include/linux/numa.h
index a42df804679e..8cb33ccfb671 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -23,22 +23,11 @@
#ifdef CONFIG_NUMA
/* Generic implementation available */
int numa_map_to_online_node(int node);
-
-/*
- * Optional architecture specific implementation, users need a "depends
- * on $ARCH"
- */
-int phys_to_target_node(phys_addr_t addr);
#else
static inline int numa_map_to_online_node(int node)
{
return NUMA_NO_NODE;
}
-
-static inline int phys_to_target_node(phys_addr_t addr)
-{
- return NUMA_NO_NODE;
-}
#endif
#endif /* _LINUX_NUMA_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index f022f581ac29..2db9a1432511 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -55,6 +55,7 @@ struct oom_control {
};
extern struct mutex oom_lock;
+extern struct mutex oom_adj_mutex;
static inline void set_current_oom_origin(void)
{
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 276140c94f4a..38ded408bd4c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -167,7 +167,7 @@ enum pageflags {
PG_slob_free = PG_private,
/* Compound pages. Stored in first tail page's flags */
- PG_double_map = PG_private_2,
+ PG_double_map = PG_workingset,
/* non-lru isolated movable page */
PG_isolated = PG_reclaim,
@@ -235,6 +235,9 @@ static inline void page_init_poison(struct page *page, size_t size)
*
* PF_NO_COMPOUND:
* the page flag is not relevant for compound pages.
+ *
+ * PF_SECOND:
+ * the page flag is stored in the first tail page.
*/
#define PF_POISONED_CHECK(page) ({ \
VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \
@@ -250,6 +253,9 @@ static inline void page_init_poison(struct page *page, size_t size)
#define PF_NO_COMPOUND(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
PF_POISONED_CHECK(page); })
+#define PF_SECOND(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \
+ PF_POISONED_CHECK(&page[1]); })
/*
* Macros to create function definitions for page flags
@@ -688,42 +694,15 @@ static inline int PageTransTail(struct page *page)
*
* See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
*/
-static inline int PageDoubleMap(struct page *page)
-{
- return PageHead(page) && test_bit(PG_double_map, &page[1].flags);
-}
-
-static inline void SetPageDoubleMap(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- set_bit(PG_double_map, &page[1].flags);
-}
-
-static inline void ClearPageDoubleMap(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- clear_bit(PG_double_map, &page[1].flags);
-}
-static inline int TestSetPageDoubleMap(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- return test_and_set_bit(PG_double_map, &page[1].flags);
-}
-
-static inline int TestClearPageDoubleMap(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- return test_and_clear_bit(PG_double_map, &page[1].flags);
-}
-
+PAGEFLAG(DoubleMap, double_map, PF_SECOND)
+ TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
#else
TESTPAGEFLAG_FALSE(TransHuge)
TESTPAGEFLAG_FALSE(TransCompound)
TESTPAGEFLAG_FALSE(TransCompoundMap)
TESTPAGEFLAG_FALSE(TransTail)
PAGEFLAG_FALSE(DoubleMap)
- TESTSETFLAG_FALSE(DoubleMap)
- TESTCLEARFLAG_FALSE(DoubleMap)
+ TESTSCFLAG_FALSE(DoubleMap)
#endif
/*
@@ -888,6 +867,7 @@ static inline int page_has_private(struct page *page)
#undef PF_ONLY_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
+#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 434c9c34aeb6..1a3554f5d992 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -279,6 +279,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
+#define FGP_HEAD 0x00000080
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
int fgp_flags, gfp_t cache_gfp_mask);
@@ -310,18 +311,37 @@ static inline struct page *find_get_page_flags(struct address_space *mapping,
* @mapping: the address_space to search
* @offset: the page index
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
+ * Looks up the page cache entry at @mapping & @offset. If there is a
* page cache page, it is returned locked and with an increased
* refcount.
*
- * Otherwise, %NULL is returned.
- *
- * find_lock_page() may sleep.
+ * Context: May sleep.
+ * Return: A struct page or %NULL if there is no page in the cache for this
+ * index.
*/
static inline struct page *find_lock_page(struct address_space *mapping,
- pgoff_t offset)
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index, FGP_LOCK, 0);
+}
+
+/**
+ * find_lock_head - Locate, pin and lock a pagecache page.
+ * @mapping: The address_space to search.
+ * @offset: The page index.
+ *
+ * Looks up the page cache entry at @mapping & @offset. If there is a
+ * page cache page, its head page is returned locked and with an increased
+ * refcount.
+ *
+ * Context: May sleep.
+ * Return: A struct page which is !PageTail, or %NULL if there is no page
+ * in the cache for this index.
+ */
+static inline struct page *find_lock_head(struct address_space *mapping,
+ pgoff_t index)
{
- return pagecache_get_page(mapping, offset, FGP_LOCK, 0);
+ return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
}
/**
@@ -372,6 +392,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+/* Does this page contain this index? */
+static inline bool thp_contains(struct page *head, pgoff_t index)
+{
+ /* HugeTLBfs indexes the page cache in units of hpage_size */
+ if (PageHuge(head))
+ return head->index == index;
+ return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
+}
+
/*
* Given the page we found in the page cache, return the page corresponding
* to this index in the file
@@ -385,8 +414,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
return head + (index & (thp_nr_pages(head) - 1));
}
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
unsigned int nr_entries, struct page **entries,
pgoff_t *indices);
diff --git a/include/linux/range.h b/include/linux/range.h
index d1fbeb664012..274681cc3154 100644
--- a/include/linux/range.h
+++ b/include/linux/range.h
@@ -1,12 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RANGE_H
#define _LINUX_RANGE_H
+#include <linux/types.h>
struct range {
u64 start;
u64 end;
};
+static inline u64 range_len(const struct range *range)
+{
+ return range->end - range->start + 1;
+}
+
int add_range(struct range *range, int az, int nr_range,
u64 start, u64 end);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 829b0697d19c..9030f3abd969 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1208,6 +1208,10 @@ struct task_struct {
#endif
#endif
+#if IS_ENABLED(CONFIG_KUNIT)
+ struct kunit *kunit_test;
+#endif
+
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack: */
int curr_ret_stack;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index ecdc6542070f..dfd82eab2902 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -72,6 +72,7 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
#define MMF_OOM_VICTIM 25 /* mm is the oom victim */
#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS 27 /* mm is shared between processes */
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 24df2393ec03..9e155cc83b8a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -279,7 +279,7 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
#define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH)
-/* Maximum order allocatable via the slab allocagtor */
+/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT)
/*
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4340a7b6e7a1..667935c0dbd4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -170,7 +170,7 @@ enum {
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
SWP_BLKDEV = (1 << 6), /* its a block device */
SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
- SWP_FS = (1 << 8), /* swap file goes through fs */
+ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */
SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
@@ -340,7 +340,6 @@ extern void lru_note_cost_page(struct page *);
extern void lru_cache_add(struct page *);
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *head);
-extern void activate_page(struct page *);
extern void mark_page_accessed(struct page *);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
@@ -427,6 +426,7 @@ extern void free_pages_and_swap_cache(struct page **, int);
extern struct page *lookup_swap_cache(swp_entry_t entry,
struct vm_area_struct *vma,
unsigned long addr);
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
struct vm_area_struct *vma, unsigned long addr,
bool do_poll);
@@ -570,6 +570,12 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp,
return NULL;
}
+static inline
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+ return find_get_page(mapping, index);
+}
+
static inline int add_to_swap(struct page *page)
{
return 0;
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index e36b200c2a77..347f1a304190 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -23,7 +23,7 @@ struct swap_slots_cache {
void disable_swap_slots_cache_lock(void);
void reenable_swap_slots_cache_unlock(void);
-int enable_swap_slots_cache(void);
+void enable_swap_slots_cache(void);
int free_swap_slot(swp_entry_t entry);
extern bool swap_slot_cache_enabled;
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index cff7e60968b9..0369fd5fda8f 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -73,16 +73,7 @@ early_param("cma", early_cma);
static phys_addr_t __init __maybe_unused cma_early_percent_memory(void)
{
- struct memblock_region *reg;
- unsigned long total_pages = 0;
-
- /*
- * We cannot use memblock_phys_mem_size() here, because
- * memblock_analyze() has not been called yet.
- */
- for_each_memblock(memory, reg)
- total_pages += memblock_region_memory_end_pfn(reg) -
- memblock_region_memory_base_pfn(reg);
+ unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size());
return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index a3795aaaab5c..50c90d368117 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -559,7 +559,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
atomic_dec(&inode->i_writecount);
i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
+ mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_interval_tree_insert_after(tmp, mpnt,
@@ -590,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(mm, oldmm, mpnt, tmp);
+ retval = copy_page_range(tmp, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -1812,6 +1812,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
free_task(tsk);
}
+static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
+{
+ /* Skip if kernel thread */
+ if (!tsk->mm)
+ return;
+
+ /* Skip if spawning a thread or using vfork */
+ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
+ return;
+
+ /* We need to synchronize with __set_oom_adj */
+ mutex_lock(&oom_adj_mutex);
+ set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
+ /* Update the values in case they were changed after copy_signal */
+ tsk->signal->oom_score_adj = current->signal->oom_score_adj;
+ tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
+ mutex_unlock(&oom_adj_mutex);
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -2288,6 +2307,8 @@ static __latent_entropy struct task_struct *copy_process(
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ copy_oom_score_adj(clone_flags, p);
+
return p;
bad_fork_cancel_cgroup:
diff --git a/kernel/resource.c b/kernel/resource.c
index 841737bbda9e..f1175ce93a1d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -382,10 +382,13 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
if (p) {
/* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
+ *res = (struct resource) {
+ .start = max(start, p->start),
+ .end = min(end, p->end),
+ .flags = p->flags,
+ .desc = p->desc,
+ .parent = p->parent,
+ };
}
read_unlock(&resource_lock);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0c781f912f9f..491789a793ae 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2367,6 +2367,15 @@ config TEST_HMM
If unsure, say N.
+config TEST_FREE_PAGES
+ tristate "Test freeing pages"
+ help
+ Test that a memory leak does not occur due to a race between
+ freeing a block of pages and a speculative page reference.
+ Loading this module is safe if your kernel has the bug fixed.
+ If the bug is not fixed, it will leak gigabytes of memory and
+ probably OOM your system.
+
config TEST_FPU
tristate "Test floating point operations in kernel space"
depends on X86 && !KCOV_INSTRUMENT_ALL
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 047b53dbfd58..542a9c18398e 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -54,9 +54,9 @@ config KASAN_GENERIC
Enables generic KASAN mode.
This mode is supported in both GCC and Clang. With GCC it requires
- version 8.3.0 or later. With Clang it requires version 7.0.0 or
- later, but detection of out-of-bounds accesses for global variables
- is supported only since Clang 11.
+ version 8.3.0 or later. Any supported Clang version is compatible,
+ but detection of out-of-bounds accesses for global variables is
+ supported only since Clang 11.
This mode consumes about 1/8th of available memory at kernel start
and introduces an overhead of ~x1.5 for the rest of the allocations.
@@ -78,8 +78,7 @@ config KASAN_SW_TAGS
Enables software tag-based KASAN mode.
This mode requires Top Byte Ignore support by the CPU and therefore
- is only supported for arm64. This mode requires Clang version 7.0.0
- or later.
+ is only supported for arm64. This mode requires Clang.
This mode consumes about 1/16th of available memory at kernel start
and introduces an overhead of ~20% for the rest of the allocations.
@@ -167,12 +166,24 @@ config KASAN_VMALLOC
for KASAN to detect more sorts of errors (and to support vmapped
stacks), but at the cost of higher memory usage.
-config TEST_KASAN
- tristate "Module for testing KASAN for bug detection"
- depends on m
+config KASAN_KUNIT_TEST
+ tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS
+ depends on KASAN && KUNIT
+ default KUNIT_ALL_TESTS
help
- This is a test module doing various nasty things like
- out of bounds accesses, use after free. It is useful for testing
+ This is a KUnit test suite doing various nasty things like
+ out of bounds and use after free accesses. It is useful for testing
kernel debugging features like KASAN.
+ For more information on KUnit and unit tests in general, please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit
+
+config TEST_KASAN_MODULE
+ tristate "KUnit-incompatible tests of KASAN bug detection capabilities"
+ depends on m && KASAN
+ help
+ This is a part of the KASAN test suite that is incompatible with
+ KUnit. Currently includes tests that do bad copy_from/to_user
+ accesses.
+
endif # KASAN
diff --git a/lib/Makefile b/lib/Makefile
index a4a4c6864f51..49a2a9e36224 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -65,9 +65,11 @@ CFLAGS_test_bitops.o += -Werror
obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o
obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
obj-$(CONFIG_TEST_IDA) += test_ida.o
-obj-$(CONFIG_TEST_KASAN) += test_kasan.o
+obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o
CFLAGS_test_kasan.o += -fno-builtin
CFLAGS_test_kasan.o += $(call cc-disable-warning, vla)
+obj-$(CONFIG_TEST_KASAN_MODULE) += test_kasan_module.o
+CFLAGS_test_kasan_module.o += -fno-builtin
obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o
CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla)
UBSAN_SANITIZE_test_ubsan.o := y
@@ -99,6 +101,7 @@ obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o
obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o
obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o
obj-$(CONFIG_TEST_HMM) += test_hmm.o
+obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o
#
# CFLAGS for compiling floating point code inside the kernel. x86/Makefile turns
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index c36037200310..dcc35fd30d95 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -10,16 +10,12 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/sched/debug.h>
+#include <linux/sched.h>
#include "debugfs.h"
#include "string-stream.h"
#include "try-catch-impl.h"
-static void kunit_set_failure(struct kunit *test)
-{
- WRITE_ONCE(test->success, false);
-}
-
static void kunit_print_tap_version(void)
{
static bool kunit_has_printed_tap_version;
@@ -288,6 +284,10 @@ static void kunit_try_run_case(void *data)
struct kunit_suite *suite = ctx->suite;
struct kunit_case *test_case = ctx->test_case;
+#if (IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT))
+ current->kunit_test = test;
+#endif /* IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT) */
+
/*
* kunit_run_case_internal may encounter a fatal error; if it does,
* abort will be called, this thread will exit, and finally the parent
@@ -602,6 +602,9 @@ void kunit_cleanup(struct kunit *test)
spin_unlock(&test->lock);
kunit_remove_resource(test, res);
}
+#if (IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT))
+ current->kunit_test = NULL;
+#endif /* IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT)*/
}
EXPORT_SYMBOL_GPL(kunit_cleanup);
diff --git a/lib/test_free_pages.c b/lib/test_free_pages.c
new file mode 100644
index 000000000000..074e76bd76b2
--- /dev/null
+++ b/lib/test_free_pages.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * test_free_pages.c: Check that free_pages() doesn't leak memory
+ * Copyright (c) 2020 Oracle
+ * Author: Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+static void test_free_pages(gfp_t gfp)
+{
+ unsigned int i;
+
+ for (i = 0; i < 1000 * 1000; i++) {
+ unsigned long addr = __get_free_pages(gfp, 3);
+ struct page *page = virt_to_page(addr);
+
+ /* Simulate page cache getting a speculative reference */
+ get_page(page);
+ free_pages(addr, 3);
+ put_page(page);
+ }
+}
+
+static int m_in(void)
+{
+ test_free_pages(GFP_KERNEL);
+ test_free_pages(GFP_KERNEL | __GFP_COMP);
+
+ return 0;
+}
+
+static void m_ex(void)
+{
+}
+
+module_init(m_in);
+module_exit(m_ex);
+MODULE_AUTHOR("Matthew Wilcox <willy@infradead.org>");
+MODULE_LICENSE("GPL");
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e7dc3de355b7..e151a7f10519 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -36,7 +36,6 @@
static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;
-static struct page *dmirror_zero_page;
struct dmirror_device;
@@ -460,6 +459,22 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
unsigned long pfn_last;
void *ptr;
+ devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
+ if (!devmem)
+ return -ENOMEM;
+
+ res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+ if (IS_ERR(res))
+ goto err_devmem;
+
+ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+ devmem->pagemap.range.start = res->start;
+ devmem->pagemap.range.end = res->end;
+ devmem->pagemap.nr_range = 1;
+ devmem->pagemap.ops = &dmirror_devmem_ops;
+ devmem->pagemap.owner = mdevice;
+
mutex_lock(&mdevice->devmem_lock);
if (mdevice->devmem_count == mdevice->devmem_capacity) {
@@ -472,33 +487,18 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
sizeof(new_chunks[0]) * new_capacity,
GFP_KERNEL);
if (!new_chunks)
- goto err;
+ goto err_release;
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
- res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
- if (IS_ERR(res))
- goto err;
-
- devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
- if (!devmem)
- goto err_release;
-
- devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
- devmem->pagemap.res = *res;
- devmem->pagemap.ops = &dmirror_devmem_ops;
- devmem->pagemap.owner = mdevice;
-
ptr = memremap_pages(&devmem->pagemap, numa_node_id());
if (IS_ERR(ptr))
- goto err_free;
+ goto err_release;
devmem->mdevice = mdevice;
- pfn_first = devmem->pagemap.res.start >> PAGE_SHIFT;
- pfn_last = pfn_first +
- (resource_size(&devmem->pagemap.res) >> PAGE_SHIFT);
+ pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
+ pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;
mutex_unlock(&mdevice->devmem_lock);
@@ -525,12 +525,12 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
return true;
-err_free:
- kfree(devmem);
err_release:
- release_mem_region(res->start, resource_size(res));
-err:
mutex_unlock(&mdevice->devmem_lock);
+ release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
+err_devmem:
+ kfree(devmem);
+
return false;
}
@@ -1100,8 +1100,8 @@ static void dmirror_device_remove(struct dmirror_device *mdevice)
mdevice->devmem_chunks[i];
memunmap_pages(&devmem->pagemap);
- release_mem_region(devmem->pagemap.res.start,
- resource_size(&devmem->pagemap.res));
+ release_mem_region(devmem->pagemap.range.start,
+ range_len(&devmem->pagemap.range));
kfree(devmem);
}
kfree(mdevice->devmem_chunks);
@@ -1126,17 +1126,6 @@ static int __init hmm_dmirror_init(void)
goto err_chrdev;
}
- /*
- * Allocate a zero page to simulate a reserved page of device private
- * memory which is always zero. The zero_pfn page isn't used just to
- * make the code here simpler (i.e., we need a struct page for it).
- */
- dmirror_zero_page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
- if (!dmirror_zero_page) {
- ret = -ENOMEM;
- goto err_chrdev;
- }
-
pr_info("HMM test module loaded. This is only for testing HMM.\n");
return 0;
@@ -1152,8 +1141,6 @@ static void __exit hmm_dmirror_exit(void)
{
int id;
- if (dmirror_zero_page)
- __free_page(dmirror_zero_page);
for (id = 0; id < DMIRROR_NDEVICES; id++)
dmirror_device_remove(dmirror_devices + id);
unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 53e953bb1d1d..63c26171a791 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -5,8 +5,6 @@
* Author: Andrey Ryabinin <a.ryabinin@samsung.com>
*/
-#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
-
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/kasan.h>
@@ -23,6 +21,8 @@
#include <asm/page.h>
+#include <kunit/test.h>
+
#include "../mm/kasan/kasan.h"
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE)
@@ -32,418 +32,370 @@
* are not eliminated as dead code.
*/
-int kasan_int_result;
void *kasan_ptr_result;
+int kasan_int_result;
-/*
- * Note: test functions are marked noinline so that their names appear in
- * reports.
- */
+static struct kunit_resource resource;
+static struct kunit_kasan_expectation fail_data;
+static bool multishot;
-static noinline void __init kmalloc_oob_right(void)
+static int kasan_test_init(struct kunit *test)
+{
+ /*
+ * Temporarily enable multi-shot mode and set panic_on_warn=0.
+ * Otherwise, we'd only get a report for the first case.
+ */
+ multishot = kasan_save_enable_multi_shot();
+
+ return 0;
+}
+
+static void kasan_test_exit(struct kunit *test)
+{
+ kasan_restore_multi_shot(multishot);
+}
+
+/**
+ * KUNIT_EXPECT_KASAN_FAIL() - Causes a test failure when the expression does
+ * not cause a KASAN error. This uses a KUnit resource named "kasan_data." Do
+ * Do not use this name for a KUnit resource outside here.
+ *
+ */
+#define KUNIT_EXPECT_KASAN_FAIL(test, condition) do { \
+ fail_data.report_expected = true; \
+ fail_data.report_found = false; \
+ kunit_add_named_resource(test, \
+ NULL, \
+ NULL, \
+ &resource, \
+ "kasan_data", &fail_data); \
+ condition; \
+ KUNIT_EXPECT_EQ(test, \
+ fail_data.report_expected, \
+ fail_data.report_found); \
+} while (0)
+
+static void kmalloc_oob_right(struct kunit *test)
{
char *ptr;
size_t size = 123;
- pr_info("out-of-bounds to right\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- ptr[size + OOB_TAG_OFF] = 'x';
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 'x');
kfree(ptr);
}
-static noinline void __init kmalloc_oob_left(void)
+static void kmalloc_oob_left(struct kunit *test)
{
char *ptr;
size_t size = 15;
- pr_info("out-of-bounds to left\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- *ptr = *(ptr - 1);
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1));
kfree(ptr);
}
-static noinline void __init kmalloc_node_oob_right(void)
+static void kmalloc_node_oob_right(struct kunit *test)
{
char *ptr;
size_t size = 4096;
- pr_info("kmalloc_node(): out-of-bounds to right\n");
ptr = kmalloc_node(size, GFP_KERNEL, 0);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- ptr[size] = 0;
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
kfree(ptr);
}
-#ifdef CONFIG_SLUB
-static noinline void __init kmalloc_pagealloc_oob_right(void)
+static void kmalloc_pagealloc_oob_right(struct kunit *test)
{
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+ if (!IS_ENABLED(CONFIG_SLUB)) {
+ kunit_info(test, "CONFIG_SLUB is not enabled.");
+ return;
+ }
+
/* Allocate a chunk that does not fit into a SLUB cache to trigger
* the page allocator fallback.
*/
- pr_info("kmalloc pagealloc allocation: out-of-bounds to right\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- ptr[size + OOB_TAG_OFF] = 0;
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0);
kfree(ptr);
}
-static noinline void __init kmalloc_pagealloc_uaf(void)
+static void kmalloc_pagealloc_uaf(struct kunit *test)
{
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
- pr_info("kmalloc pagealloc allocation: use-after-free\n");
- ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
+ if (!IS_ENABLED(CONFIG_SLUB)) {
+ kunit_info(test, "CONFIG_SLUB is not enabled.");
return;
}
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
kfree(ptr);
- ptr[0] = 0;
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0);
}
-static noinline void __init kmalloc_pagealloc_invalid_free(void)
+static void kmalloc_pagealloc_invalid_free(struct kunit *test)
{
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
- pr_info("kmalloc pagealloc allocation: invalid-free\n");
- ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
+ if (!IS_ENABLED(CONFIG_SLUB)) {
+ kunit_info(test, "CONFIG_SLUB is not enabled.");
return;
}
- kfree(ptr + 1);
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1));
}
-#endif
-static noinline void __init kmalloc_large_oob_right(void)
+static void kmalloc_large_oob_right(struct kunit *test)
{
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE - 256;
/* Allocate a chunk that is large enough, but still fits into a slab
* and does not trigger the page allocator fallback in SLUB.
*/
- pr_info("kmalloc large allocation: out-of-bounds to right\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- ptr[size] = 0;
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
kfree(ptr);
}
-static noinline void __init kmalloc_oob_krealloc_more(void)
+static void kmalloc_oob_krealloc_more(struct kunit *test)
{
char *ptr1, *ptr2;
size_t size1 = 17;
size_t size2 = 19;
- pr_info("out-of-bounds after krealloc more\n");
ptr1 = kmalloc(size1, GFP_KERNEL);
- ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
- if (!ptr1 || !ptr2) {
- pr_err("Allocation failed\n");
- kfree(ptr1);
- kfree(ptr2);
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
- ptr2[size2 + OOB_TAG_OFF] = 'x';
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x');
kfree(ptr2);
}
-static noinline void __init kmalloc_oob_krealloc_less(void)
+static void kmalloc_oob_krealloc_less(struct kunit *test)
{
char *ptr1, *ptr2;
size_t size1 = 17;
size_t size2 = 15;
- pr_info("out-of-bounds after krealloc less\n");
ptr1 = kmalloc(size1, GFP_KERNEL);
- ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
- if (!ptr1 || !ptr2) {
- pr_err("Allocation failed\n");
- kfree(ptr1);
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
- ptr2[size2 + OOB_TAG_OFF] = 'x';
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x');
kfree(ptr2);
}
-static noinline void __init kmalloc_oob_16(void)
+static void kmalloc_oob_16(struct kunit *test)
{
struct {
u64 words[2];
} *ptr1, *ptr2;
- pr_info("kmalloc out-of-bounds for 16-bytes access\n");
ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
- if (!ptr1 || !ptr2) {
- pr_err("Allocation failed\n");
- kfree(ptr1);
- kfree(ptr2);
- return;
- }
- *ptr1 = *ptr2;
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
kfree(ptr1);
kfree(ptr2);
}
-static noinline void __init kmalloc_oob_memset_2(void)
+static void kmalloc_oob_memset_2(struct kunit *test)
{
char *ptr;
size_t size = 8;
- pr_info("out-of-bounds in memset2\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- memset(ptr + 7 + OOB_TAG_OFF, 0, 2);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 7 + OOB_TAG_OFF, 0, 2));
kfree(ptr);
}
-static noinline void __init kmalloc_oob_memset_4(void)
+static void kmalloc_oob_memset_4(struct kunit *test)
{
char *ptr;
size_t size = 8;
- pr_info("out-of-bounds in memset4\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- memset(ptr + 5 + OOB_TAG_OFF, 0, 4);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 5 + OOB_TAG_OFF, 0, 4));
kfree(ptr);
}
-static noinline void __init kmalloc_oob_memset_8(void)
+static void kmalloc_oob_memset_8(struct kunit *test)
{
char *ptr;
size_t size = 8;
- pr_info("out-of-bounds in memset8\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- memset(ptr + 1 + OOB_TAG_OFF, 0, 8);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 8));
kfree(ptr);
}
-static noinline void __init kmalloc_oob_memset_16(void)
+static void kmalloc_oob_memset_16(struct kunit *test)
{
char *ptr;
size_t size = 16;
- pr_info("out-of-bounds in memset16\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- memset(ptr + 1 + OOB_TAG_OFF, 0, 16);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 16));
kfree(ptr);
}
-static noinline void __init kmalloc_oob_in_memset(void)
+static void kmalloc_oob_in_memset(struct kunit *test)
{
char *ptr;
size_t size = 666;
- pr_info("out-of-bounds in memset\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- memset(ptr, 0, size + 5 + OOB_TAG_OFF);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + 5 + OOB_TAG_OFF));
kfree(ptr);
}
-static noinline void __init kmalloc_memmove_invalid_size(void)
+static void kmalloc_memmove_invalid_size(struct kunit *test)
{
char *ptr;
size_t size = 64;
volatile size_t invalid_size = -2;
- pr_info("invalid size in memmove\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
memset((char *)ptr, 0, 64);
- memmove((char *)ptr, (char *)ptr + 4, invalid_size);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
kfree(ptr);
}
-static noinline void __init kmalloc_uaf(void)
+static void kmalloc_uaf(struct kunit *test)
{
char *ptr;
size_t size = 10;
- pr_info("use-after-free\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- *(ptr + 8) = 'x';
+ KUNIT_EXPECT_KASAN_FAIL(test, *(ptr + 8) = 'x');
}
-static noinline void __init kmalloc_uaf_memset(void)
+static void kmalloc_uaf_memset(struct kunit *test)
{
char *ptr;
size_t size = 33;
- pr_info("use-after-free in memset\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- memset(ptr, 0, size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size));
}
-static noinline void __init kmalloc_uaf2(void)
+static void kmalloc_uaf2(struct kunit *test)
{
char *ptr1, *ptr2;
size_t size = 43;
- pr_info("use-after-free after another kmalloc\n");
ptr1 = kmalloc(size, GFP_KERNEL);
- if (!ptr1) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
kfree(ptr1);
+
ptr2 = kmalloc(size, GFP_KERNEL);
- if (!ptr2) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x');
+ KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
- ptr1[40] = 'x';
- if (ptr1 == ptr2)
- pr_err("Could not detect use-after-free: ptr1 == ptr2\n");
kfree(ptr2);
}
-static noinline void __init kfree_via_page(void)
+static void kfree_via_page(struct kunit *test)
{
char *ptr;
size_t size = 8;
struct page *page;
unsigned long offset;
- pr_info("invalid-free false positive (via page)\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
page = virt_to_page(ptr);
offset = offset_in_page(ptr);
kfree(page_address(page) + offset);
}
-static noinline void __init kfree_via_phys(void)
+static void kfree_via_phys(struct kunit *test)
{
char *ptr;
size_t size = 8;
phys_addr_t phys;
- pr_info("invalid-free false positive (via phys)\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
phys = virt_to_phys(ptr);
kfree(phys_to_virt(phys));
}
-static noinline void __init kmem_cache_oob(void)
+static void kmem_cache_oob(struct kunit *test)
{
char *p;
size_t size = 200;
struct kmem_cache *cache = kmem_cache_create("test_cache",
size, 0,
0, NULL);
- if (!cache) {
- pr_err("Cache allocation failed\n");
- return;
- }
- pr_info("out-of-bounds in kmem_cache_alloc\n");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
p = kmem_cache_alloc(cache, GFP_KERNEL);
if (!p) {
- pr_err("Allocation failed\n");
+ kunit_err(test, "Allocation failed: %s\n", __func__);
kmem_cache_destroy(cache);
return;
}
- *p = p[size + OOB_TAG_OFF];
-
+ KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]);
kmem_cache_free(cache, p);
kmem_cache_destroy(cache);
}
-static noinline void __init memcg_accounted_kmem_cache(void)
+static void memcg_accounted_kmem_cache(struct kunit *test)
{
int i;
char *p;
@@ -451,12 +403,8 @@ static noinline void __init memcg_accounted_kmem_cache(void)
struct kmem_cache *cache;
cache = kmem_cache_create("test_cache", size, 0, SLAB_ACCOUNT, NULL);
- if (!cache) {
- pr_err("Cache allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
- pr_info("allocate memcg accounted object\n");
/*
* Several allocations with a delay to allow for lazy per memcg kmem
* cache creation.
@@ -476,134 +424,93 @@ free_cache:
static char global_array[10];
-static noinline void __init kasan_global_oob(void)
+static void kasan_global_oob(struct kunit *test)
{
volatile int i = 3;
char *p = &global_array[ARRAY_SIZE(global_array) + i];
- pr_info("out-of-bounds global variable\n");
- *(volatile char *)p;
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
-static noinline void __init kasan_stack_oob(void)
-{
- char stack_array[10];
- volatile int i = OOB_TAG_OFF;
- char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
-
- pr_info("out-of-bounds on stack\n");
- *(volatile char *)p;
-}
-
-static noinline void __init ksize_unpoisons_memory(void)
+static void ksize_unpoisons_memory(struct kunit *test)
{
char *ptr;
size_t size = 123, real_size;
- pr_info("ksize() unpoisons the whole allocated chunk\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
real_size = ksize(ptr);
/* This access doesn't trigger an error. */
ptr[size] = 'x';
/* This one does. */
- ptr[real_size] = 'y';
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y');
kfree(ptr);
}
-static noinline void __init copy_user_test(void)
+static void kasan_stack_oob(struct kunit *test)
{
- char *kmem;
- char __user *usermem;
- size_t size = 10;
- int unused;
-
- kmem = kmalloc(size, GFP_KERNEL);
- if (!kmem)
- return;
+ char stack_array[10];
+ volatile int i = OOB_TAG_OFF;
+ char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
- usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_ANONYMOUS | MAP_PRIVATE, 0);
- if (IS_ERR(usermem)) {
- pr_err("Failed to allocate user memory\n");
- kfree(kmem);
+ if (!IS_ENABLED(CONFIG_KASAN_STACK)) {
+ kunit_info(test, "CONFIG_KASAN_STACK is not enabled");
return;
}
- pr_info("out-of-bounds in copy_from_user()\n");
- unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
-
- pr_info("out-of-bounds in copy_to_user()\n");
- unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
-
- pr_info("out-of-bounds in __copy_from_user()\n");
- unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
-
- pr_info("out-of-bounds in __copy_to_user()\n");
- unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
-
- pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
- unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF);
-
- pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
- unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF);
-
- pr_info("out-of-bounds in strncpy_from_user()\n");
- unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
-
- vm_munmap((unsigned long)usermem, PAGE_SIZE);
- kfree(kmem);
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
-static noinline void __init kasan_alloca_oob_left(void)
+static void kasan_alloca_oob_left(struct kunit *test)
{
volatile int i = 10;
char alloca_array[i];
char *p = alloca_array - 1;
- pr_info("out-of-bounds to left on alloca\n");
- *(volatile char *)p;
+ if (!IS_ENABLED(CONFIG_KASAN_STACK)) {
+ kunit_info(test, "CONFIG_KASAN_STACK is not enabled");
+ return;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
-static noinline void __init kasan_alloca_oob_right(void)
+static void kasan_alloca_oob_right(struct kunit *test)
{
volatile int i = 10;
char alloca_array[i];
char *p = alloca_array + i;
- pr_info("out-of-bounds to right on alloca\n");
- *(volatile char *)p;
+ if (!IS_ENABLED(CONFIG_KASAN_STACK)) {
+ kunit_info(test, "CONFIG_KASAN_STACK is not enabled");
+ return;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
-static noinline void __init kmem_cache_double_free(void)
+static void kmem_cache_double_free(struct kunit *test)
{
char *p;
size_t size = 200;
struct kmem_cache *cache;
cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
- if (!cache) {
- pr_err("Cache allocation failed\n");
- return;
- }
- pr_info("double-free on heap object\n");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
p = kmem_cache_alloc(cache, GFP_KERNEL);
if (!p) {
- pr_err("Allocation failed\n");
+ kunit_err(test, "Allocation failed: %s\n", __func__);
kmem_cache_destroy(cache);
return;
}
kmem_cache_free(cache, p);
- kmem_cache_free(cache, p);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p));
kmem_cache_destroy(cache);
}
-static noinline void __init kmem_cache_invalid_free(void)
+static void kmem_cache_invalid_free(struct kunit *test)
{
char *p;
size_t size = 200;
@@ -611,20 +518,17 @@ static noinline void __init kmem_cache_invalid_free(void)
cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU,
NULL);
- if (!cache) {
- pr_err("Cache allocation failed\n");
- return;
- }
- pr_info("invalid-free of heap object\n");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
p = kmem_cache_alloc(cache, GFP_KERNEL);
if (!p) {
- pr_err("Allocation failed\n");
+ kunit_err(test, "Allocation failed: %s\n", __func__);
kmem_cache_destroy(cache);
return;
}
/* Trigger invalid free, the object doesn't get freed */
- kmem_cache_free(cache, p + 1);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1));
/*
* Properly free the object to prevent the "Objects remaining in
@@ -635,45 +539,63 @@ static noinline void __init kmem_cache_invalid_free(void)
kmem_cache_destroy(cache);
}
-static noinline void __init kasan_memchr(void)
+static void kasan_memchr(struct kunit *test)
{
char *ptr;
size_t size = 24;
- pr_info("out-of-bounds in memchr\n");
- ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
- if (!ptr)
+ /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ kunit_info(test,
+ "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT");
return;
+ }
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_ptr_result = memchr(ptr, '1', size + 1));
- kasan_ptr_result = memchr(ptr, '1', size + 1);
kfree(ptr);
}
-static noinline void __init kasan_memcmp(void)
+static void kasan_memcmp(struct kunit *test)
{
char *ptr;
size_t size = 24;
int arr[9];
- pr_info("out-of-bounds in memcmp\n");
- ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
- if (!ptr)
+ /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ kunit_info(test,
+ "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT");
return;
+ }
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
memset(arr, 0, sizeof(arr));
- kasan_int_result = memcmp(ptr, arr, size + 1);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_int_result = memcmp(ptr, arr, size+1));
kfree(ptr);
}
-static noinline void __init kasan_strings(void)
+static void kasan_strings(struct kunit *test)
{
char *ptr;
size_t size = 24;
- pr_info("use-after-free in strchr\n");
- ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
- if (!ptr)
+ /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ kunit_info(test,
+ "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT");
return;
+ }
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
@@ -684,220 +606,164 @@ static noinline void __init kasan_strings(void)
* will likely point to zeroed byte.
*/
ptr += 16;
- kasan_ptr_result = strchr(ptr, '1');
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1'));
- pr_info("use-after-free in strrchr\n");
- kasan_ptr_result = strrchr(ptr, '1');
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1'));
- pr_info("use-after-free in strcmp\n");
- kasan_int_result = strcmp(ptr, "2");
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2"));
- pr_info("use-after-free in strncmp\n");
- kasan_int_result = strncmp(ptr, "2", 1);
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1));
- pr_info("use-after-free in strlen\n");
- kasan_int_result = strlen(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr));
- pr_info("use-after-free in strnlen\n");
- kasan_int_result = strnlen(ptr, 1);
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1));
}
-static noinline void __init kasan_bitops(void)
+static void kasan_bitops(struct kunit *test)
{
/*
* Allocate 1 more byte, which causes kzalloc to round up to 16-bytes;
* this way we do not actually corrupt other memory.
*/
long *bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL);
- if (!bits)
- return;
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
/*
* Below calls try to access bit within allocated memory; however, the
* below accesses are still out-of-bounds, since bitops are defined to
* operate on the whole long the bit is in.
*/
- pr_info("out-of-bounds in set_bit\n");
- set_bit(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, set_bit(BITS_PER_LONG, bits));
- pr_info("out-of-bounds in __set_bit\n");
- __set_bit(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(BITS_PER_LONG, bits));
- pr_info("out-of-bounds in clear_bit\n");
- clear_bit(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(BITS_PER_LONG, bits));
- pr_info("out-of-bounds in __clear_bit\n");
- __clear_bit(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(BITS_PER_LONG, bits));
- pr_info("out-of-bounds in clear_bit_unlock\n");
- clear_bit_unlock(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(BITS_PER_LONG, bits));
- pr_info("out-of-bounds in __clear_bit_unlock\n");
- __clear_bit_unlock(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(BITS_PER_LONG, bits));
- pr_info("out-of-bounds in change_bit\n");
- change_bit(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, change_bit(BITS_PER_LONG, bits));
- pr_info("out-of-bounds in __change_bit\n");
- __change_bit(BITS_PER_LONG, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(BITS_PER_LONG, bits));
/*
* Below calls try to access bit beyond allocated memory.
*/
- pr_info("out-of-bounds in test_and_set_bit\n");
- test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits));
- pr_info("out-of-bounds in __test_and_set_bit\n");
- __test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ __test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits));
- pr_info("out-of-bounds in test_and_set_bit_lock\n");
- test_and_set_bit_lock(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ test_and_set_bit_lock(BITS_PER_LONG + BITS_PER_BYTE, bits));
- pr_info("out-of-bounds in test_and_clear_bit\n");
- test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits));
- pr_info("out-of-bounds in __test_and_clear_bit\n");
- __test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ __test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits));
- pr_info("out-of-bounds in test_and_change_bit\n");
- test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits));
- pr_info("out-of-bounds in __test_and_change_bit\n");
- __test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ __test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits));
- pr_info("out-of-bounds in test_bit\n");
- kasan_int_result = test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_int_result =
+ test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits));
#if defined(clear_bit_unlock_is_negative_byte)
- pr_info("out-of-bounds in clear_bit_unlock_is_negative_byte\n");
- kasan_int_result = clear_bit_unlock_is_negative_byte(BITS_PER_LONG +
- BITS_PER_BYTE, bits);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_int_result = clear_bit_unlock_is_negative_byte(
+ BITS_PER_LONG + BITS_PER_BYTE, bits));
#endif
kfree(bits);
}
-static noinline void __init kmalloc_double_kzfree(void)
+static void kmalloc_double_kzfree(struct kunit *test)
{
char *ptr;
size_t size = 16;
- pr_info("double-free (kfree_sensitive)\n");
ptr = kmalloc(size, GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree_sensitive(ptr);
- kfree_sensitive(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
}
-#ifdef CONFIG_KASAN_VMALLOC
-static noinline void __init vmalloc_oob(void)
+static void vmalloc_oob(struct kunit *test)
{
void *area;
- pr_info("vmalloc out-of-bounds\n");
+ if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+ kunit_info(test, "CONFIG_KASAN_VMALLOC is not enabled.");
+ return;
+ }
/*
* We have to be careful not to hit the guard page.
* The MMU will catch that and crash us.
*/
area = vmalloc(3000);
- if (!area) {
- pr_err("Allocation failed\n");
- return;
- }
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area);
- ((volatile char *)area)[3100];
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]);
vfree(area);
}
-#else
-static void __init vmalloc_oob(void) {}
-#endif
-static struct kasan_rcu_info {
- int i;
- struct rcu_head rcu;
-} *global_rcu_ptr;
-
-static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
-{
- struct kasan_rcu_info *fp = container_of(rp,
- struct kasan_rcu_info, rcu);
-
- kfree(fp);
- fp->i = 1;
-}
-
-static noinline void __init kasan_rcu_uaf(void)
-{
- struct kasan_rcu_info *ptr;
-
- pr_info("use-after-free in kasan_rcu_reclaim\n");
- ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- global_rcu_ptr = rcu_dereference_protected(ptr, NULL);
- call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim);
-}
-
-static int __init kmalloc_tests_init(void)
-{
- /*
- * Temporarily enable multi-shot mode. Otherwise, we'd only get a
- * report for the first case.
- */
- bool multishot = kasan_save_enable_multi_shot();
-
- kmalloc_oob_right();
- kmalloc_oob_left();
- kmalloc_node_oob_right();
-#ifdef CONFIG_SLUB
- kmalloc_pagealloc_oob_right();
- kmalloc_pagealloc_uaf();
- kmalloc_pagealloc_invalid_free();
-#endif
- kmalloc_large_oob_right();
- kmalloc_oob_krealloc_more();
- kmalloc_oob_krealloc_less();
- kmalloc_oob_16();
- kmalloc_oob_in_memset();
- kmalloc_oob_memset_2();
- kmalloc_oob_memset_4();
- kmalloc_oob_memset_8();
- kmalloc_oob_memset_16();
- kmalloc_memmove_invalid_size();
- kmalloc_uaf();
- kmalloc_uaf_memset();
- kmalloc_uaf2();
- kfree_via_page();
- kfree_via_phys();
- kmem_cache_oob();
- memcg_accounted_kmem_cache();
- kasan_stack_oob();
- kasan_global_oob();
- kasan_alloca_oob_left();
- kasan_alloca_oob_right();
- ksize_unpoisons_memory();
- copy_user_test();
- kmem_cache_double_free();
- kmem_cache_invalid_free();
- kasan_memchr();
- kasan_memcmp();
- kasan_strings();
- kasan_bitops();
- kmalloc_double_kzfree();
- vmalloc_oob();
- kasan_rcu_uaf();
-
- kasan_restore_multi_shot(multishot);
-
- return -EAGAIN;
-}
+static struct kunit_case kasan_kunit_test_cases[] = {
+ KUNIT_CASE(kmalloc_oob_right),
+ KUNIT_CASE(kmalloc_oob_left),
+ KUNIT_CASE(kmalloc_node_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_uaf),
+ KUNIT_CASE(kmalloc_pagealloc_invalid_free),
+ KUNIT_CASE(kmalloc_large_oob_right),
+ KUNIT_CASE(kmalloc_oob_krealloc_more),
+ KUNIT_CASE(kmalloc_oob_krealloc_less),
+ KUNIT_CASE(kmalloc_oob_16),
+ KUNIT_CASE(kmalloc_oob_in_memset),
+ KUNIT_CASE(kmalloc_oob_memset_2),
+ KUNIT_CASE(kmalloc_oob_memset_4),
+ KUNIT_CASE(kmalloc_oob_memset_8),
+ KUNIT_CASE(kmalloc_oob_memset_16),
+ KUNIT_CASE(kmalloc_memmove_invalid_size),
+ KUNIT_CASE(kmalloc_uaf),
+ KUNIT_CASE(kmalloc_uaf_memset),
+ KUNIT_CASE(kmalloc_uaf2),
+ KUNIT_CASE(kfree_via_page),
+ KUNIT_CASE(kfree_via_phys),
+ KUNIT_CASE(kmem_cache_oob),
+ KUNIT_CASE(memcg_accounted_kmem_cache),
+ KUNIT_CASE(kasan_global_oob),
+ KUNIT_CASE(kasan_stack_oob),
+ KUNIT_CASE(kasan_alloca_oob_left),
+ KUNIT_CASE(kasan_alloca_oob_right),
+ KUNIT_CASE(ksize_unpoisons_memory),
+ KUNIT_CASE(kmem_cache_double_free),
+ KUNIT_CASE(kmem_cache_invalid_free),
+ KUNIT_CASE(kasan_memchr),
+ KUNIT_CASE(kasan_memcmp),
+ KUNIT_CASE(kasan_strings),
+ KUNIT_CASE(kasan_bitops),
+ KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(vmalloc_oob),
+ {}
+};
+
+static struct kunit_suite kasan_kunit_test_suite = {
+ .name = "kasan",
+ .init = kasan_test_init,
+ .test_cases = kasan_kunit_test_cases,
+ .exit = kasan_test_exit,
+};
+
+kunit_test_suite(kasan_kunit_test_suite);
-module_init(kmalloc_tests_init);
MODULE_LICENSE("GPL");
diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c
new file mode 100644
index 000000000000..2d68db6ae67b
--- /dev/null
+++ b/lib/test_kasan_module.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ */
+
+#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
+
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "../mm/kasan/kasan.h"
+
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE)
+
+static noinline void __init copy_user_test(void)
+{
+ char *kmem;
+ char __user *usermem;
+ size_t size = 10;
+ int unused;
+
+ kmem = kmalloc(size, GFP_KERNEL);
+ if (!kmem)
+ return;
+
+ usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ if (IS_ERR(usermem)) {
+ pr_err("Failed to allocate user memory\n");
+ kfree(kmem);
+ return;
+ }
+
+ pr_info("out-of-bounds in copy_from_user()\n");
+ unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+
+ pr_info("out-of-bounds in copy_to_user()\n");
+ unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
+
+ pr_info("out-of-bounds in __copy_from_user()\n");
+ unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+
+ pr_info("out-of-bounds in __copy_to_user()\n");
+ unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
+
+ pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF);
+
+ pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF);
+
+ pr_info("out-of-bounds in strncpy_from_user()\n");
+ unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+
+ vm_munmap((unsigned long)usermem, PAGE_SIZE);
+ kfree(kmem);
+}
+
+static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+} *global_rcu_ptr;
+
+static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
+{
+ struct kasan_rcu_info *fp = container_of(rp,
+ struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ fp->i = 1;
+}
+
+static noinline void __init kasan_rcu_uaf(void)
+{
+ struct kasan_rcu_info *ptr;
+
+ pr_info("use-after-free in kasan_rcu_reclaim\n");
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ global_rcu_ptr = rcu_dereference_protected(ptr, NULL);
+ call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim);
+}
+
+
+static int __init test_kasan_module_init(void)
+{
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, we'd only get a
+ * report for the first case.
+ */
+ bool multishot = kasan_save_enable_multi_shot();
+
+ copy_user_test();
+ kasan_rcu_uaf();
+
+ kasan_restore_multi_shot(multishot);
+ return -EAGAIN;
+}
+
+module_init(test_kasan_module_init);
+MODULE_LICENSE("GPL");
diff --git a/mm/Kconfig b/mm/Kconfig
index e3ee7b32c637..8c60c49a123b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -831,10 +831,10 @@ config PERCPU_STATS
be used to help understand percpu memory usage.
config GUP_BENCHMARK
- bool "Enable infrastructure for get_user_pages_fast() benchmarking"
+ bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
help
Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages_fast().
+ performance of get_user_pages() and related calls.
See tools/testing/selftests/vm/gup_benchmark.c
diff --git a/mm/Makefile b/mm/Makefile
index d5649f1c12c0..d73aed0fc99c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -94,7 +94,6 @@ obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
-obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 176dcded298e..6c63844fc061 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -180,11 +180,10 @@ bool compaction_deferred(struct zone *zone, int order)
return false;
/* Avoid possible overflow */
- if (++zone->compact_considered > defer_limit)
+ if (++zone->compact_considered >= defer_limit) {
zone->compact_considered = defer_limit;
-
- if (zone->compact_considered >= defer_limit)
return false;
+ }
trace_mm_compaction_deferred(zone, order);
diff --git a/mm/debug.c b/mm/debug.c
index ca8d1cacdecc..ccca576b2899 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -102,12 +102,12 @@ void __dump_page(struct page *page, const char *reason)
if (hpage_pincount_available(page)) {
pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
head, compound_order(head),
- head_mapcount(head),
- head_pincount(head));
+ head_compound_mapcount(head),
+ head_compound_pincount(head));
} else {
pr_warn("head:%p order:%u compound_mapcount:%d\n",
head, compound_order(head),
- head_mapcount(head));
+ head_compound_mapcount(head));
}
}
if (PageKsm(page))
@@ -120,6 +120,7 @@ void __dump_page(struct page *page, const char *reason)
struct hlist_node *dentry_first;
struct dentry *dentry_ptr;
struct dentry dentry;
+ unsigned long ino;
/*
* mapping can be invalid pointer and we don't want to crash
@@ -136,21 +137,22 @@ void __dump_page(struct page *page, const char *reason)
goto out_mapping;
}
- if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) {
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
+ get_kernel_nofault(ino, &host->i_ino)) {
pr_warn("aops:%ps with invalid host inode %px\n",
a_ops, host);
goto out_mapping;
}
if (!dentry_first) {
- pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino);
+ pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
goto out_mapping;
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
if (get_kernel_nofault(dentry, dentry_ptr)) {
- pr_warn("aops:%ps with invalid dentry %px\n", a_ops,
- dentry_ptr);
+ pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
+ a_ops, ino, dentry_ptr);
} else {
/*
* if dentry is corrupted, the %pd handler may still
@@ -158,7 +160,7 @@ void __dump_page(struct page *page, const char *reason)
* corrupted struct page
*/
pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
- a_ops, host->i_ino, &dentry);
+ a_ops, ino, &dentry);
}
}
out_mapping:
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f9fb9bbd733e..a97c97232337 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -266,6 +266,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
*/
void dma_pool_destroy(struct dma_pool *pool)
{
+ struct dma_page *page, *tmp;
bool empty = false;
if (unlikely(!pool))
@@ -281,17 +282,13 @@ void dma_pool_destroy(struct dma_pool *pool)
device_remove_file(pool->dev, &dev_attr_pools);
mutex_unlock(&pools_reg_lock);
- while (!list_empty(&pool->page_list)) {
- struct dma_page *page;
- page = list_entry(pool->page_list.next,
- struct dma_page, page_list);
+ list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
if (is_page_busy(page)) {
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_destroy %s, %p busy\n",
+ dev_err(pool->dev, "%s %s, %p busy\n", __func__,
pool->name, page->vaddr);
else
- pr_err("dma_pool_destroy %s, %p busy\n",
+ pr_err("%s %s, %p busy\n", __func__,
pool->name, page->vaddr);
/* leak the still-in-use consistent memory */
list_del(&page->page_list);
@@ -355,12 +352,11 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (data[i] == POOL_POISON_FREED)
continue;
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_alloc %s, %p (corrupted)\n",
- pool->name, retval);
+ dev_err(pool->dev, "%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
else
- pr_err("dma_pool_alloc %s, %p (corrupted)\n",
- pool->name, retval);
+ pr_err("%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
/*
* Dump the first 4 bytes even if they are not
@@ -416,12 +412,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
if (!page) {
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_free %s, %p/%lx (bad dma)\n",
- pool->name, vaddr, (unsigned long)dma);
+ dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
else
- pr_err("dma_pool_free %s, %p/%lx (bad dma)\n",
- pool->name, vaddr, (unsigned long)dma);
+ pr_err("%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
return;
}
@@ -432,12 +427,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
if ((dma - page->dma) != offset) {
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_free %s, %p (bad vaddr)/%pad\n",
- pool->name, vaddr, &dma);
+ dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
else
- pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n",
- pool->name, vaddr, &dma);
+ pr_err("%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
return;
}
{
@@ -449,11 +443,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n",
- pool->name, &dma);
+ dev_err(pool->dev, "%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
else
- pr_err("dma_pool_free %s, dma %pad already free\n",
- pool->name, &dma);
+ pr_err("%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
return;
}
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0e66f2aaeea3..d6baa4f451c5 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -141,7 +141,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
}
if (end_index >= start_index) {
- unsigned long count;
+ unsigned long nr_pagevec = 0;
/*
* It's common to FADV_DONTNEED right after
@@ -154,8 +154,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
lru_add_drain();
- count = invalidate_mapping_pages(mapping,
- start_index, end_index);
+ invalidate_mapping_pagevec(mapping,
+ start_index, end_index,
+ &nr_pagevec);
/*
* If fewer pages were invalidated than expected then
@@ -163,7 +164,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
* a per-cpu pagevec for a remote CPU. Drain all
* pagevecs and try again.
*/
- if (count < (end_index - start_index + 1)) {
+ if (nr_pagevec) {
lru_add_drain_all();
invalidate_mapping_pages(mapping, start_index,
end_index);
diff --git a/mm/filemap.c b/mm/filemap.c
index 748b7b1b4f6d..38546dca58fe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1645,19 +1645,19 @@ EXPORT_SYMBOL(page_cache_prev_miss);
/**
* find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
*
* Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
- XA_STATE(xas, &mapping->i_pages, offset);
+ XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
rcu_read_lock();
@@ -1685,7 +1685,6 @@ repeat:
put_page(page);
goto repeat;
}
- page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1693,40 +1692,37 @@ out:
}
/**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_entry(mapping, index);
if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page_mapping(page) != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
return page;
}
-EXPORT_SYMBOL(find_lock_entry);
/**
* pagecache_get_page - Find and get a reference to a page.
@@ -1741,6 +1737,8 @@ EXPORT_SYMBOL(find_lock_entry);
*
* * %FGP_ACCESSED - The page will be marked accessed.
* * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ * rather than the exact page specified by the index.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1781,12 +1779,12 @@ repeat:
}
/* Has the page been truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != index, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
if (fgp_flags & FGP_ACCESSED)
@@ -1796,6 +1794,8 @@ repeat:
if (page_is_idle(page))
clear_page_idle(page);
}
+ if (!(fgp_flags & FGP_HEAD))
+ page = find_subpage(page, index);
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
@@ -2793,42 +2793,42 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *page;
+ struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
- xas_for_each(&xas, page, end_pgoff) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, head, end_pgoff) {
+ if (xas_retry(&xas, head))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(head))
goto next;
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(page))
+ if (PageLocked(head))
goto next;
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto next;
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(head != xas_reload(&xas)))
goto skip;
- page = find_subpage(page, xas.xa_index);
+ page = find_subpage(head, xas.xa_index);
- if (!PageUptodate(page) ||
+ if (!PageUptodate(head) ||
PageReadahead(page) ||
PageHWPoison(page))
goto skip;
- if (!trylock_page(page))
+ if (!trylock_page(head))
goto skip;
- if (page->mapping != mapping || !PageUptodate(page))
+ if (head->mapping != mapping || !PageUptodate(head))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= max_idx)
+ if (xas.xa_index >= max_idx)
goto unlock;
if (mmap_miss > 0)
@@ -2840,12 +2840,12 @@ void filemap_map_pages(struct vm_fault *vmf,
last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, page))
goto unlock;
- unlock_page(page);
+ unlock_page(head);
goto next;
unlock:
- unlock_page(page);
+ unlock_page(head);
skip:
- put_page(page);
+ put_page(head);
next:
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
diff --git a/mm/gup.c b/mm/gup.c
index e869c634cc9a..ad617e7f22f5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -329,6 +329,13 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
unsigned long index;
/*
+ * If this WARN_ON() fires, then the system *might* be leaking pages (by
+ * leaving them pinned), but probably not. More likely, gup/pup returned
+ * a hard -ERRNO error to the caller, who erroneously passed it here.
+ */
+ if (WARN_ON(IS_ERR_VALUE(npages)))
+ return;
+ /*
* TODO: this can be optimized for huge pages: if a series of pages is
* physically contiguous and part of the same compound page, then a
* single operation to the head page should suffice.
@@ -1747,6 +1754,25 @@ static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+static bool is_valid_gup_flags(unsigned int gup_flags)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return false;
+ /*
+ * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
+ * that is, FOLL_LONGTERM is a specific case, more restrictive case of
+ * FOLL_PIN.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return false;
+
+ return true;
+}
+
#ifdef CONFIG_MMU
static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -1842,11 +1868,7 @@ long get_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
@@ -1892,11 +1914,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __gup_longterm_locked(current->mm, start, nr_pages,
@@ -2786,11 +2804,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
/*
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index be690fa66a46..464cae1fa3ea 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -6,10 +6,10 @@
#include <linux/debugfs.h>
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
-#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
struct gup_benchmark {
__u64 get_delta_usec;
@@ -28,7 +28,6 @@ static void put_back_pages(unsigned int cmd, struct page **pages,
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
@@ -36,6 +35,7 @@ static void put_back_pages(unsigned int cmd, struct page **pages,
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
unpin_user_pages(pages, nr_pages);
break;
}
@@ -50,6 +50,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
switch (cmd) {
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
page = pages[i];
if (WARN(!page_maybe_dma_pinned(page),
@@ -101,11 +102,6 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
- case GUP_LONGTERM_BENCHMARK:
- nr = get_user_pages(addr, nr,
- gup->flags | FOLL_LONGTERM,
- pages + i, NULL);
- break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
@@ -118,6 +114,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
+ case PIN_LONGTERM_BENCHMARK:
+ nr = pin_user_pages(addr, nr,
+ gup->flags | FOLL_LONGTERM,
+ pages + i, NULL);
+ break;
default:
kvfree(pages);
ret = -EINVAL;
@@ -162,10 +163,10 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
break;
default:
return -EINVAL;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ec0f0cc49545..65c289c13b58 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2306,13 +2306,13 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
/*
* If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't page aligned and it could previously
+ * vm_next->vm_start isn't hpage aligned and it could previously
* contain an hugepage: check if we need to split an huge pmd.
*/
if (adjust_next > 0) {
struct vm_area_struct *next = vma->vm_next;
unsigned long nstart = next->vm_start;
- nstart += adjust_next << PAGE_SHIFT;
+ nstart += adjust_next;
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67fc6383995b..2fb9a4c7a161 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -240,7 +240,6 @@ get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
resv->region_cache_count--;
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
- VM_BUG_ON(!nrg);
list_del(&nrg->link);
nrg->from = from;
@@ -309,8 +308,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
list_del(&rg->link);
kfree(rg);
- coalesce_file_region(resv, prg);
- return;
+ rg = prg;
}
nrg = list_next_entry(rg, link);
@@ -320,22 +318,20 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
list_del(&rg->link);
kfree(rg);
-
- coalesce_file_region(resv, nrg);
- return;
}
}
-/* Must be called with resv->lock held. Calling this with count_only == true
- * will count the number of pages to be added but will not modify the linked
- * list. If regions_needed != NULL and count_only == true, then regions_needed
- * will indicate the number of file_regions needed in the cache to carry out to
- * add the regions for this range.
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
*/
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
struct hugetlb_cgroup *h_cg,
- struct hstate *h, long *regions_needed,
- bool count_only)
+ struct hstate *h, long *regions_needed)
{
long add = 0;
struct list_head *head = &resv->regions;
@@ -371,14 +367,14 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
*/
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
- if (!count_only) {
+ if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
- } else if (regions_needed)
+ } else
*regions_needed += 1;
}
@@ -390,13 +386,13 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
*/
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
- if (!count_only) {
+ if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
- } else if (regions_needed)
+ } else
*regions_needed += 1;
}
@@ -448,11 +444,8 @@ static int allocate_file_region_entries(struct resv_map *resv,
spin_lock(&resv->lock);
- list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
- list_del(&rg->link);
- list_add(&rg->link, &resv->region_cache);
- resv->region_cache_count++;
- }
+ list_splice(&allocated_regions, &resv->region_cache);
+ resv->region_cache_count += to_allocate;
}
return 0;
@@ -492,8 +485,8 @@ static long region_add(struct resv_map *resv, long f, long t,
retry:
/* Count how many regions are actually needed to execute this add. */
- add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
- true);
+ add_reservation_in_range(resv, f, t, NULL, NULL,
+ &actual_regions_needed);
/*
* Check for sufficient descriptors in the cache to accommodate
@@ -521,7 +514,7 @@ retry:
goto retry;
}
- add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+ add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
resv->adds_in_progress -= in_regions_needed;
@@ -557,9 +550,9 @@ static long region_chg(struct resv_map *resv, long f, long t,
spin_lock(&resv->lock);
- /* Count how many hugepages in this range are NOT respresented. */
+ /* Count how many hugepages in this range are NOT represented. */
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
- out_regions_needed, true);
+ out_regions_needed);
if (*out_regions_needed == 0)
*out_regions_needed = 1;
@@ -1047,21 +1040,17 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
if (nocma && is_migrate_cma_page(page))
continue;
- if (!PageHWPoison(page))
- break;
+ if (PageHWPoison(page))
+ continue;
+
+ list_move(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
}
- /*
- * if 'non-isolated free hugepage' not found on the list,
- * the allocation fails.
- */
- if (&h->hugepage_freelists[nid] == &page->lru)
- return NULL;
- list_move(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- return page;
+ return NULL;
}
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
@@ -1511,9 +1500,9 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
+ spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -2423,7 +2412,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
- list_move(&page->lru, &h->hugepage_activelist);
+ list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
@@ -3799,23 +3788,23 @@ bool is_hugetlb_entry_migration(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
+ if (is_migration_entry(swp))
return true;
else
return false;
}
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
- return 0;
+ return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
+ if (is_hwpoison_entry(swp))
+ return true;
else
- return 0;
+ return false;
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -5348,10 +5337,16 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner.
*
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * This routine must be called with i_mmap_rwsem held in at least read mode if
+ * sharing is possible. For hugetlbfs, this prevents removal of any page
+ * table entries associated with the address space. This is important as we
+ * are setting up sharing based on existing page table entries (mappings).
+ *
+ * NOTE: This routine is only called from huge_pte_alloc. Some callers of
+ * huge_pte_alloc know that sharing is not possible and do not take
+ * i_mmap_rwsem as a performance optimization. This is handled by the
+ * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
+ * only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -5368,6 +5363,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
diff --git a/mm/internal.h b/mm/internal.h
index 10c677655912..a801a4d51f26 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -65,6 +65,9 @@ static inline void ra_submit(struct file_ra_state *ra,
ra->start, ra->size, ra->async_size);
}
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
+
/**
* page_evictable - test whether a page is evictable
* @page: the page to test
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 4f49fa6cd1aa..00a53f1355ae 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -33,6 +33,8 @@
#include <asm/sections.h>
+#include <kunit/test.h>
+
#include "kasan.h"
#include "../slab.h"
@@ -93,7 +95,7 @@ static void end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn) {
+ if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) {
/*
* This thread may hit another WARN() in the panic path.
* Resetting this prevents additional WARN() from panicking the
@@ -464,12 +466,37 @@ static bool report_enabled(void)
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
}
+#if IS_ENABLED(CONFIG_KUNIT)
+static void kasan_update_kunit_status(struct kunit *cur_test)
+{
+ struct kunit_resource *resource;
+ struct kunit_kasan_expectation *kasan_data;
+
+ resource = kunit_find_named_resource(cur_test, "kasan_data");
+
+ if (!resource) {
+ kunit_set_failure(cur_test);
+ return;
+ }
+
+ kasan_data = (struct kunit_kasan_expectation *)resource->data;
+ kasan_data->report_found = true;
+ kunit_put_resource(resource);
+}
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
void kasan_report_invalid_free(void *object, unsigned long ip)
{
unsigned long flags;
u8 tag = get_tag(object);
object = reset_tag(object);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
print_tags(tag, object);
@@ -488,6 +515,11 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
void *untagged_addr;
unsigned long flags;
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
disable_trace_on_warning();
tagged_addr = (void *)addr;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5e252d91eb14..c0014d3b91c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1471,15 +1471,15 @@ static void kmemleak_scan(void)
if (kmemleak_stack_scan) {
struct task_struct *p, *g;
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
void *stack = try_get_task_stack(p);
if (stack) {
scan_block(stack, stack + THREAD_SIZE, NULL);
put_task_stack(p);
}
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
+ }
+ rcu_read_unlock();
}
/*
diff --git a/mm/madvise.c b/mm/madvise.c
index 0e0d61003fc6..9b065d412e5f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -224,25 +224,28 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
- pgoff_t index;
+ XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
+ pgoff_t end_index = end / PAGE_SIZE;
struct page *page;
- swp_entry_t swap;
- for (; start < end; start += PAGE_SIZE) {
- index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ rcu_read_lock();
+ xas_for_each(&xas, page, end_index) {
+ swp_entry_t swap;
- page = find_get_entry(mapping, index);
- if (!xa_is_value(page)) {
- if (page)
- put_page(page);
+ if (!xa_is_value(page))
continue;
- }
+ xas_pause(&xas);
+ rcu_read_unlock();
+
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false);
if (page)
put_page(page);
+
+ rcu_read_lock();
}
+ rcu_read_unlock();
lru_add_drain(); /* Push any new pages onto the LRU now */
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 45f198750be9..165f40a8a254 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -132,7 +132,26 @@ struct memblock_type physmem = {
};
#endif
-int memblock_debug __initdata_memblock;
+/*
+ * keep a pointer to &memblock.memory in the text section to use it in
+ * __next_mem_range() and its helpers.
+ * For architectures that do not keep memblock data after init, this
+ * pointer will be reset to NULL at memblock_discard()
+ */
+static __refdata struct memblock_type *memblock_memory = &memblock.memory;
+
+#define for_each_memblock_type(i, memblock_type, rgn) \
+ for (i = 0, rgn = &memblock_type->regions[0]; \
+ i < memblock_type->cnt; \
+ i++, rgn = &memblock_type->regions[i])
+
+#define memblock_dbg(fmt, ...) \
+ do { \
+ if (memblock_debug) \
+ pr_info(fmt, ##__VA_ARGS__); \
+ } while (0)
+
+static int memblock_debug __initdata_memblock;
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
@@ -391,6 +410,8 @@ void __init memblock_discard(void)
memblock.memory.max);
__memblock_free_late(addr, size);
}
+
+ memblock_memory = NULL;
}
#endif
@@ -941,42 +962,16 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
}
-/**
- * __next_reserved_mem_region - next function for for_each_reserved_region()
- * @idx: pointer to u64 loop variable
- * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
- * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL
- *
- * Iterate over all reserved memory regions.
- */
-void __init_memblock __next_reserved_mem_region(u64 *idx,
- phys_addr_t *out_start,
- phys_addr_t *out_end)
-{
- struct memblock_type *type = &memblock.reserved;
-
- if (*idx < type->cnt) {
- struct memblock_region *r = &type->regions[*idx];
- phys_addr_t base = r->base;
- phys_addr_t size = r->size;
-
- if (out_start)
- *out_start = base;
- if (out_end)
- *out_end = base + size - 1;
-
- *idx += 1;
- return;
- }
-
- /* signal end of iteration */
- *idx = ULLONG_MAX;
-}
-
-static bool should_skip_region(struct memblock_region *m, int nid, int flags)
+static bool should_skip_region(struct memblock_type *type,
+ struct memblock_region *m,
+ int nid, int flags)
{
int m_nid = memblock_get_region_node(m);
+ /* we never skip regions when iterating memblock.reserved or physmem */
+ if (type != memblock_memory)
+ return false;
+
/* only memory regions are associated with nodes, check it */
if (nid != NUMA_NO_NODE && nid != m_nid)
return true;
@@ -1041,7 +1036,7 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- if (should_skip_region(m, nid, flags))
+ if (should_skip_region(type_a, m, nid, flags))
continue;
if (!type_b) {
@@ -1145,7 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- if (should_skip_region(m, nid, flags))
+ if (should_skip_region(type_a, m, nid, flags))
continue;
if (!type_b) {
@@ -1649,23 +1644,6 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
-phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
-{
- unsigned long pages = 0;
- struct memblock_region *r;
- unsigned long start_pfn, end_pfn;
-
- for_each_memblock(memory, r) {
- start_pfn = memblock_region_memory_base_pfn(r);
- end_pfn = memblock_region_memory_end_pfn(r);
- start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
- end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
- pages += end_pfn - start_pfn;
- }
-
- return PFN_PHYS(pages);
-}
-
/* lowest address */
phys_addr_t __init_memblock memblock_start_of_DRAM(void)
{
@@ -1689,7 +1667,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
* the memory memblock regions, if the @limit exceeds the total size
* of those regions, max_addr will keep original value PHYS_ADDR_MAX
*/
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (limit <= r->size) {
max_addr = r->base + limit;
break;
@@ -1859,7 +1837,7 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
phys_addr_t start, end, orig_start, orig_end;
struct memblock_region *r;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
orig_start = r->base;
orig_end = r->base + r->size;
start = round_up(orig_start, align);
@@ -1915,7 +1893,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
}
}
-void __init_memblock __memblock_dump_all(void)
+static void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
pr_info(" memory size = %pa reserved size = %pa\n",
@@ -1929,6 +1907,12 @@ void __init_memblock __memblock_dump_all(void)
#endif
}
+void __init_memblock memblock_dump_all(void)
+{
+ if (memblock_debug)
+ __memblock_dump_all();
+}
+
void __init memblock_allow_resize(void)
{
memblock_can_resize = 1;
@@ -1981,7 +1965,7 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_reserved_mem_region(i, &start, &end)
+ for_each_reserved_mem_range(i, &start, &end)
reserve_bootmem_region(start, end);
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5c1983c84395..7f74a158cfa8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,14 +197,6 @@ static struct move_charge_struct {
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
-enum charge_type {
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
- MEM_CGROUP_CHARGE_TYPE_ANON,
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
- NR_CHARGE_TYPE,
-};
-
/* for encoding cft->private value on file */
enum res_type {
_MEM,
@@ -1102,9 +1094,9 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1456,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
+struct memory_stat {
+ const char *name;
+ unsigned int ratio;
+ unsigned int idx;
+};
+
+static struct memory_stat memory_stats[] = {
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+ { "percpu", 1, MEMCG_PERCPU_B },
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
+ { "shmem", PAGE_SIZE, NR_SHMEM },
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * The ratio will be initialized in memory_stats_init(). Because
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
+ * constant(e.g. powerpc).
+ */
+ { "anon_thp", 0, NR_ANON_THPS },
+#endif
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
+
+ /*
+ * Note: The slab_reclaimable and slab_unreclaimable must be
+ * together and slab_reclaimable must be in front.
+ */
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+
+ /* The memory events */
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+};
+
+static int __init memory_stats_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memory_stats[i].idx == NR_ANON_THPS)
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
+#endif
+ VM_BUG_ON(!memory_stats[i].ratio);
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+ }
+
+ return 0;
+}
+pure_initcall(memory_stats_init);
+
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
@@ -1476,52 +1532,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
* Current memory state:
*/
- seq_buf_printf(&s, "anon %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
- PAGE_SIZE);
- seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
- 1024);
- seq_buf_printf(&s, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
- seq_buf_printf(&s, "percpu %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
- seq_buf_printf(&s, "sock %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_SOCK) *
- PAGE_SIZE);
-
- seq_buf_printf(&s, "shmem %llu\n",
- (u64)memcg_page_state(memcg, NR_SHMEM) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_mapped %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_dirty %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_writeback %llu\n",
- (u64)memcg_page_state(memcg, NR_WRITEBACK) *
- PAGE_SIZE);
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ u64 size;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_THPS) *
- HPAGE_PMD_SIZE);
-#endif
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ size = memcg_page_state(memcg, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
- seq_buf_printf(&s, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
- seq_buf_printf(&s, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ seq_buf_printf(&s, "slab %llu\n", size);
+ }
+ }
/* Accumulated memory events */
@@ -1529,22 +1552,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_events(memcg, PGFAULT));
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
memcg_events(memcg, PGMAJFAULT));
-
- seq_buf_printf(&s, "workingset_refault_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
- seq_buf_printf(&s, "workingset_refault_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
- seq_buf_printf(&s, "workingset_activate_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
- seq_buf_printf(&s, "workingset_activate_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
- seq_buf_printf(&s, "workingset_restore_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
- seq_buf_printf(&s, "workingset_restore_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
- seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
- memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
@@ -1641,17 +1648,19 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long max;
+ unsigned long max = READ_ONCE(memcg->memory.max);
- max = READ_ONCE(memcg->memory.max);
- if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_max;
- unsigned long swap_max;
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
+ } else { /* v1 */
+ if (mem_cgroup_swappiness(memcg)) {
+ /* Calculate swap excess capacity from memsw limit */
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
- memsw_max = memcg->memsw.max;
- swap_max = READ_ONCE(memcg->swap.max);
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
- max = min(max + swap_max, memsw_max);
+ max += min(swap, (unsigned long)total_swap_pages);
+ }
}
return max;
}
@@ -1817,8 +1826,8 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
/*
- * When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+ * Be careful about under_oom underflows becase a child memcg
+ * could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
@@ -2888,6 +2897,17 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
+ * bit of the pointer is set.
+ * The page->mem_cgroup pointer can be asynchronously changed
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
+ * from a valid memcg pointer to objcg vector or back.
+ */
+ if (!page->mem_cgroup)
+ return NULL;
+
+ /*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
@@ -4255,17 +4275,16 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
new->size = size;
/* Copy thresholds (if any) to new array */
- if (thresholds->primary) {
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
- sizeof(struct mem_cgroup_threshold));
- }
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
/* Add new threshold */
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+ sort(new->entries, size, sizeof(*new->entries),
compare_thresholds, NULL);
/* Find current threshold */
@@ -5291,13 +5310,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
- page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
- page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
/*
@@ -5426,7 +5443,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
@@ -5500,7 +5516,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
+ if (!(mc.flags & MOVE_ANON))
return NULL;
/*
@@ -5519,6 +5535,9 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return page;
}
+ if (non_swap_entry(ent))
+ return NULL;
+
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
@@ -5539,35 +5558,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
- struct page *page = NULL;
- struct address_space *mapping;
- pgoff_t pgoff;
-
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- mapping = vma->vm_file->f_mapping;
- pgoff = linear_page_index(vma, addr);
-
/* page is moved even if it's not RSS of this task(page-faulted). */
-#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- if (xa_is_value(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- *entry = swp;
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
- return page;
+ return find_get_incore_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
}
/**
@@ -6393,6 +6392,35 @@ static int memory_stat_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_NUMA
+static int memory_numa_stat_show(struct seq_file *m, void *v)
+{
+ int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ int nid;
+
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ seq_printf(m, "%s", memory_stats[i].name);
+ for_each_node_state(nid, N_MEMORY) {
+ u64 size;
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_printf(m, " N%d=%llu", nid, size);
+ }
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+#endif
+
static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -6470,6 +6498,12 @@ static struct cftype memory_files[] = {
.name = "stat",
.seq_show = memory_stat_show,
},
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memory_numa_stat_show,
+ },
+#endif
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a1e73943445e..990e3b2e37d5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -484,11 +484,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct address_space *mapping = page->mapping;
+ pgoff_t pgoff;
i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
+ pgoff = page_to_pgoff(page);
for_each_process(tsk) {
- pgoff_t pgoff = page_to_pgoff(page);
struct task_struct *t = task_early_kill(tsk, force_early);
if (!t)
@@ -824,7 +825,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
-#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
#define head (1UL << PG_head)
#define slab (1UL << PG_slab)
@@ -873,7 +873,6 @@ static struct page_state {
#undef sc
#undef unevict
#undef mlock
-#undef writeback
#undef lru
#undef head
#undef slab
diff --git a/mm/memory.c b/mm/memory.c
index eeae590e526a..f482af8bc828 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -794,15 +794,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* lock.
*/
static inline int
-copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte,
- struct vm_area_struct *vma, struct vm_area_struct *new,
- unsigned long addr, int *rss, struct page **prealloc,
- pte_t pte, struct page *page)
+copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc, pte_t pte, struct page *page)
{
+ struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
- if (!is_cow_mapping(vma->vm_flags))
+ if (!is_cow_mapping(src_vma->vm_flags))
return 1;
/*
@@ -832,16 +831,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* over and copy the page & arm it.
*/
*prealloc = NULL;
- copy_user_highpage(new_page, page, addr, vma);
+ copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
- page_add_new_anon_rmap(new_page, new, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, new);
+ page_add_new_anon_rmap(new_page, dst_vma, addr, false);
+ lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(new_page, new->vm_page_prot);
- pte = maybe_mkwrite(pte_mkdirty(pte), new);
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ pte = mk_pte(new_page, dst_vma->vm_page_prot);
+ pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -850,24 +849,21 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* is required to copy this pte.
*/
static inline int
-copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, int *rss, struct page **prealloc)
+copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc)
{
- unsigned long vm_flags = vma->vm_flags;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ unsigned long vm_flags = src_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
- page = vm_normal_page(vma, addr, pte);
+ page = vm_normal_page(src_vma, addr, pte);
if (page) {
int retval;
- retval = copy_present_page(dst_mm, src_mm,
- dst_pte, src_pte,
- vma, new,
- addr, rss, prealloc,
- pte, page);
+ retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, pte, page);
if (retval <= 0)
return retval;
@@ -901,7 +897,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!(vm_flags & VM_UFFD_WP))
pte = pte_clear_uffd_wp(pte);
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -924,11 +920,13 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
return new_page;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static int
+copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
@@ -971,15 +969,15 @@ again:
if (unlikely(!pte_present(*src_pte))) {
entry.val = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
- vma, addr, rss);
+ src_vma, addr, rss);
if (entry.val)
break;
progress += 8;
continue;
}
/* copy_present_pte() will clear `*prealloc' if consumed */
- ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
- vma, new, addr, rss, &prealloc);
+ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
@@ -1014,7 +1012,7 @@ again:
entry.val = 0;
} else if (ret) {
WARN_ON_ONCE(ret != -EAGAIN);
- prealloc = page_copy_prealloc(src_mm, vma, addr);
+ prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
/* We've captured and resolved the error. Reset, try again. */
@@ -1028,11 +1026,13 @@ out:
return ret;
}
-static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
@@ -1045,9 +1045,9 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
|| pmd_devmap(*src_pmd)) {
int err;
- VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm,
- dst_pmd, src_pmd, addr, vma);
+ dst_pmd, src_pmd, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1056,18 +1056,20 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
}
if (pmd_none_or_clear_bad(src_pmd))
continue;
- if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
- vma, new, addr, next))
+ if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
+ addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
-static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pud_t *src_pud, *dst_pud;
unsigned long next;
@@ -1080,9 +1082,9 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
int err;
- VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
err = copy_huge_pud(dst_mm, src_mm,
- dst_pud, src_pud, addr, vma);
+ dst_pud, src_pud, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1091,18 +1093,19 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
}
if (pud_none_or_clear_bad(src_pud))
continue;
- if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
- vma, new, addr, next))
+ if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
+ addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
}
-static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
p4d_t *src_p4d, *dst_p4d;
unsigned long next;
@@ -1114,20 +1117,22 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(src_p4d))
continue;
- if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
- vma, new, addr, next))
+ if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
+ addr, next))
return -ENOMEM;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return 0;
}
-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- struct vm_area_struct *vma, struct vm_area_struct *new)
+int
+copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
- unsigned long addr = vma->vm_start;
- unsigned long end = vma->vm_end;
+ unsigned long addr = src_vma->vm_start;
+ unsigned long end = src_vma->vm_end;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
bool is_cow;
int ret;
@@ -1138,19 +1143,19 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
- !vma->anon_vma)
+ if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+ !src_vma->anon_vma)
return 0;
- if (is_vm_hugetlb_page(vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (is_vm_hugetlb_page(src_vma))
+ return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
- if (unlikely(vma->vm_flags & VM_PFNMAP)) {
+ if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
* We do not free on error cases below as remove_vma
* gets called on error from higher level routine
*/
- ret = track_pfn_copy(vma);
+ ret = track_pfn_copy(src_vma);
if (ret)
return ret;
}
@@ -1161,11 +1166,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* parent mm. And a permission downgrade will only happen if
* is_cow_mapping() returns true.
*/
- is_cow = is_cow_mapping(vma->vm_flags);
+ is_cow = is_cow_mapping(src_vma->vm_flags);
if (is_cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
- 0, vma, src_mm, addr, end);
+ 0, src_vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -1176,8 +1181,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
- if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, new, addr, next))) {
+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+ addr, next))) {
ret = -ENOMEM;
break;
}
@@ -3589,7 +3594,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
* unlock_page(A)
* lock_page(B)
* lock_page(B)
- * pte_alloc_pne
+ * pte_alloc_one
* shrink_page_list
* wait_on_page_writeback(A)
* SetPageWriteback(B)
@@ -3597,7 +3602,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
* # flush A, B to clear the writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3764,7 +3769,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
/**
* alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the fucntion allocates page table or use pre-allocated.
+ * mapping. If needed, the function allocates page table or use pre-allocated.
*
* @vmf: fault environment
* @page: page to map
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce3e73e3a5c1..8e9e2d44cdad 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
#ifdef CONFIG_NUMA
int __weak memory_add_physaddr_to_nid(u64 start)
{
- pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
start);
return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+int __weak phys_to_target_node(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
#endif
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eddbe4e56c73..3fde772ef5ef 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -875,13 +875,12 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
- task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
mpol_put(new);
goto out;
}
+ task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -1324,9 +1323,7 @@ static long do_mbind(unsigned long start, unsigned long len,
NODEMASK_SCRATCH(scratch);
if (scratch) {
mmap_write_lock(mm);
- task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
if (err)
mmap_write_unlock(mm);
} else
@@ -1885,8 +1882,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
}
/* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
- int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
diff --git a/mm/mempool.c b/mm/mempool.c
index 79bff63ecf27..f473cdddaff0 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -58,11 +58,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
static void check_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
+ if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
__check_element(pool, element, ksize(element));
-
- /* Mempools backed by page allocator */
- if (pool->free == mempool_free_pages) {
+ } else if (pool->free == mempool_free_pages) {
+ /* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
@@ -82,11 +81,10 @@ static void __poison_element(void *element, size_t size)
static void poison_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
__poison_element(element, ksize(element));
-
- /* Mempools backed by page allocator */
- if (pool->alloc == mempool_alloc_pages) {
+ } else if (pool->alloc == mempool_alloc_pages) {
+ /* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
@@ -107,7 +105,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_poison_kfree(element, _RET_IP_);
- if (pool->alloc == mempool_alloc_pages)
+ else if (pool->alloc == mempool_alloc_pages)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
@@ -115,7 +113,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_slab(element);
- if (pool->alloc == mempool_alloc_pages)
+ else if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 006dace60b1a..198083453182 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -40,12 +40,10 @@ EXPORT_SYMBOL_GPL(memremap_compat_align);
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_managed_enable;
static void devmap_managed_enable_put(void)
{
- if (atomic_dec_and_test(&devmap_managed_enable))
- static_branch_disable(&devmap_managed_key);
+ static_branch_dec(&devmap_managed_key);
}
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
@@ -56,8 +54,7 @@ static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
return -EINVAL;
}
- if (atomic_inc_return(&devmap_managed_enable) == 1)
- static_branch_enable(&devmap_managed_key);
+ static_branch_inc(&devmap_managed_key);
return 0;
}
#else
@@ -70,24 +67,28 @@ static void devmap_managed_enable_put(void)
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */
-static void pgmap_array_delete(struct resource *res)
+static void pgmap_array_delete(struct range *range)
{
- xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
NULL, GFP_KERNEL);
synchronize_rcu();
}
-static unsigned long pfn_first(struct dev_pagemap *pgmap)
+static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
{
- return PHYS_PFN(pgmap->res.start) +
- vmem_altmap_offset(pgmap_altmap(pgmap));
+ struct range *range = &pgmap->ranges[range_id];
+ unsigned long pfn = PHYS_PFN(range->start);
+
+ if (range_id)
+ return pfn;
+ return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
}
-static unsigned long pfn_end(struct dev_pagemap *pgmap)
+static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
{
- const struct resource *res = &pgmap->res;
+ const struct range *range = &pgmap->ranges[range_id];
- return (res->start + resource_size(res)) >> PAGE_SHIFT;
+ return (range->start + range_len(range)) >> PAGE_SHIFT;
}
static unsigned long pfn_next(unsigned long pfn)
@@ -97,8 +98,8 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
-#define for_each_device_pfn(pfn, map) \
- for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+#define for_each_device_pfn(pfn, map, i) \
+ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
{
@@ -124,39 +125,49 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
pgmap->ref = NULL;
}
-void memunmap_pages(struct dev_pagemap *pgmap)
+static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
- struct resource *res = &pgmap->res;
+ struct range *range = &pgmap->ranges[range_id];
struct page *first_page;
- unsigned long pfn;
int nid;
- dev_pagemap_kill(pgmap);
- for_each_device_pfn(pfn, pgmap)
- put_page(pfn_to_page(pfn));
- dev_pagemap_cleanup(pgmap);
-
/* make sure to access a memmap that was actually initialized */
- first_page = pfn_to_page(pfn_first(pgmap));
+ first_page = pfn_to_page(pfn_first(pgmap, range_id));
/* pages are dead and unused, undo the arch mapping */
nid = page_to_nid(first_page);
mem_hotplug_begin();
- remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)));
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- __remove_pages(PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), NULL);
+ __remove_pages(PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), NULL);
} else {
- arch_remove_memory(nid, res->start, resource_size(res),
+ arch_remove_memory(nid, range->start, range_len(range),
pgmap_altmap(pgmap));
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- pgmap_array_delete(res);
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ pgmap_array_delete(range);
+}
+
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+ unsigned long pfn;
+ int i;
+
+ dev_pagemap_kill(pgmap);
+ for (i = 0; i < pgmap->nr_range; i++)
+ for_each_device_pfn(pfn, pgmap, i)
+ put_page(pfn_to_page(pfn));
+ dev_pagemap_cleanup(pgmap);
+
+ for (i = 0; i < pgmap->nr_range; i++)
+ pageunmap_range(pgmap, i);
+
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
devmap_managed_enable_put();
}
@@ -175,6 +186,114 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
complete(&pgmap->done);
}
+static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
+ int range_id, int nid)
+{
+ struct range *range = &pgmap->ranges[range_id];
+ struct dev_pagemap *conflict_pgmap;
+ int error, is_ram;
+
+ if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0,
+ "altmap not supported for multiple ranges\n"))
+ return -EINVAL;
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ is_ram = region_intersects(range->start, range_len(range),
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
+ is_ram == REGION_MIXED ? "mixed" : "ram",
+ range->start, range->end);
+ return -ENXIO;
+ }
+
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
+ PHYS_PFN(range->end), pgmap, GFP_KERNEL));
+ if (error)
+ return error;
+
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
+ range_len(range));
+ if (error)
+ goto err_pfn_remap;
+
+ mem_hotplug_begin();
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params);
+ } else {
+ error = kasan_add_zero_shadow(__va(range->start), range_len(range));
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, range->start, range_len(range),
+ params);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params->altmap);
+ }
+
+ mem_hotplug_done();
+ if (error)
+ goto err_add_memory;
+
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
+ - pfn_first(pgmap, range_id));
+ return 0;
+
+err_add_memory:
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
+err_kasan:
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+err_pfn_remap:
+ pgmap_array_delete(range);
+ return error;
+}
+
+
/*
* Not device managed version of dev_memremap_pages, undone by
* memunmap_pages(). Please use dev_memremap_pages if you have a struct
@@ -182,17 +301,16 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
*/
void *memremap_pages(struct dev_pagemap *pgmap, int nid)
{
- struct resource *res = &pgmap->res;
- struct dev_pagemap *conflict_pgmap;
struct mhp_params params = {
- /*
- * We do not want any optional features only our own memmap
- */
.altmap = pgmap_altmap(pgmap),
.pgprot = PAGE_KERNEL,
};
- int error, is_ram;
+ const int nr_range = pgmap->nr_range;
bool need_devmap_managed = true;
+ int error, i;
+
+ if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
+ return ERR_PTR(-EINVAL);
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
@@ -251,105 +369,27 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(error);
}
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
- if (conflict_pgmap) {
- WARN(1, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
- if (conflict_pgmap) {
- WARN(1, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- is_ram = region_intersects(res->start, resource_size(res),
- IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
-
- if (is_ram != REGION_DISJOINT) {
- WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
- is_ram == REGION_MIXED ? "mixed" : "ram", res);
- error = -ENXIO;
- goto err_array;
- }
-
- error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
- PHYS_PFN(res->end), pgmap, GFP_KERNEL));
- if (error)
- goto err_array;
-
- if (nid < 0)
- nid = numa_mem_id();
-
- error = track_pfn_remap(NULL, &params.pgprot, PHYS_PFN(res->start),
- 0, resource_size(res));
- if (error)
- goto err_pfn_remap;
-
- mem_hotplug_begin();
-
/*
- * For device private memory we call add_pages() as we only need to
- * allocate and initialize struct page for the device memory. More-
- * over the device memory is un-accessible thus we do not want to
- * create a linear mapping for the memory like arch_add_memory()
- * would do.
- *
- * For all other device memory types, which are accessible by
- * the CPU, we do want the linear mapping and thus use
- * arch_add_memory().
+ * Clear the pgmap nr_range as it will be incremented for each
+ * successfully processed range. This communicates how many
+ * regions to unwind in the abort case.
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- error = add_pages(nid, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), &params);
- } else {
- error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
- if (error) {
- mem_hotplug_done();
- goto err_kasan;
- }
-
- error = arch_add_memory(nid, res->start, resource_size(res),
- &params);
+ pgmap->nr_range = 0;
+ error = 0;
+ for (i = 0; i < nr_range; i++) {
+ error = pagemap_range(pgmap, &params, i, nid);
+ if (error)
+ break;
+ pgmap->nr_range++;
}
- if (!error) {
- struct zone *zone;
-
- zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
- move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), params.altmap);
+ if (i < nr_range) {
+ memunmap_pages(pgmap);
+ pgmap->nr_range = nr_range;
+ return ERR_PTR(error);
}
- mem_hotplug_done();
- if (error)
- goto err_add_memory;
-
- /*
- * Initialization of the pages has been deferred until now in order
- * to allow us to do the work while not holding the hotplug lock.
- */
- memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), pgmap);
- percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
- return __va(res->start);
-
- err_add_memory:
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
- err_kasan:
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- err_pfn_remap:
- pgmap_array_delete(res);
- err_array:
- dev_pagemap_kill(pgmap);
- dev_pagemap_cleanup(pgmap);
- devmap_managed_enable_put();
- return ERR_PTR(error);
+ return __va(pgmap->ranges[0].start);
}
EXPORT_SYMBOL_GPL(memremap_pages);
@@ -369,7 +409,7 @@ EXPORT_SYMBOL_GPL(memremap_pages);
* 'live' on entry and will be killed and reaped at
* devm_memremap_pages_release() time, or if this routine fails.
*
- * 4/ res is expected to be a host memory range that could feasibly be
+ * 4/ range is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
* this is not enforced.
*/
@@ -426,7 +466,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
* In the cached case we're already holding a live reference.
*/
if (pgmap) {
- if (phys >= pgmap->res.start && phys <= pgmap->res.end)
+ if (phys >= pgmap->range.start && phys <= pgmap->range.end)
return pgmap;
put_dev_pagemap(pgmap);
}
@@ -451,8 +491,6 @@ void free_devmap_managed_page(struct page *page)
return;
}
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
__ClearPageWaiters(page);
mem_cgroup_uncharge(page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 4de11dfd730b..f94d7c7eeddf 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -381,7 +381,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
int expected_count = 1;
/*
- * Device public or private pages have an extra refcount as they are
+ * Device private pages have an extra refcount as they are
* ZONE_DEVICE pages.
*/
expected_count += is_device_private_page(page);
@@ -3077,7 +3077,6 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
remove_migration_ptes(page, newpage, false);
unlock_page(page);
- migrate->cpages--;
if (is_zone_device_page(page))
put_page(page);
diff --git a/mm/mincore.c b/mm/mincore.c
index 453ff112470f..02db1a834021 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -48,7 +48,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
-static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
unsigned char present = 0;
struct page *page;
@@ -59,31 +59,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
-#ifdef CONFIG_SWAP
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- /*
- * shmem/tmpfs may return swap: account for swapcache
- * page too.
- */
- if (xa_is_value(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- struct swap_info_struct *si;
-
- /* Prevent swap device to being swapoff under us */
- si = get_swap_device(swp);
- if (si) {
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- put_swap_device(si);
- } else
- page = NULL;
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
+ page = find_get_incore_page(mapping, index);
if (page) {
present = PageUptodate(page);
put_page(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index e71d2d471416..67d11ad6df24 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file_inode(file)->i_writecount);
+ allow_write_access(file);
if (vma->vm_flags & VM_SHARED)
mapping_unmap_writable(mapping);
@@ -474,8 +474,12 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the "next" vma being erased if
- * next->vm_start was reduced.
+ * with the possible exception of
+ *
+ * a. the "next" vma being erased if next->vm_start was reduced in
+ * __vma_adjust() -> __vma_unlink()
+ * b. the vma being erased in detach_vmas_to_be_unmapped() ->
+ * vma_rb_erase()
*/
validate_mm_rb(root, ignore);
@@ -485,13 +489,7 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
struct rb_root *root)
{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the vma being erased.
- */
- validate_mm_rb(root, vma);
-
- __vma_rb_erase(vma, root);
+ vma_rb_erase_ignore(vma, root, vma);
}
/*
@@ -623,7 +621,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
if (vma->vm_flags & VM_DENYWRITE)
atomic_dec(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
+ mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_insert(vma, &mapping->i_mmap);
@@ -677,7 +675,7 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
mm->map_count++;
}
-static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+static __always_inline void __vma_unlink(struct mm_struct *mm,
struct vm_area_struct *vma,
struct vm_area_struct *ignore)
{
@@ -760,7 +758,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* vma expands, overlapping part of the next:
* mprotect case 5 shifting the boundary up.
*/
- adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
+ adjust_next = (end - next->vm_start);
exporter = next;
importer = vma;
VM_WARN_ON(expand != importer);
@@ -770,7 +768,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -(vma->vm_end - end);
exporter = vma;
importer = next;
VM_WARN_ON(expand != importer);
@@ -825,7 +823,7 @@ again:
anon_vma_interval_tree_pre_update_vma(next);
}
- if (root) {
+ if (file) {
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, root);
if (adjust_next)
@@ -842,11 +840,11 @@ again:
}
vma->vm_pgoff = pgoff;
if (adjust_next) {
- next->vm_start += adjust_next << PAGE_SHIFT;
- next->vm_pgoff += adjust_next;
+ next->vm_start += adjust_next;
+ next->vm_pgoff += adjust_next >> PAGE_SHIFT;
}
- if (root) {
+ if (file) {
if (adjust_next)
vma_interval_tree_insert(next, root);
vma_interval_tree_insert(vma, root);
@@ -859,7 +857,7 @@ again:
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
- __vma_unlink_common(mm, next, next);
+ __vma_unlink(mm, next, next);
else
/*
* vma is not before next if they've been
@@ -870,7 +868,7 @@ again:
* "next" (which is stored in post-swap()
* "vma").
*/
- __vma_unlink_common(mm, next, vma);
+ __vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
@@ -897,10 +895,9 @@ again:
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock_write(anon_vma);
}
- if (mapping)
- i_mmap_unlock_write(mapping);
- if (root) {
+ if (file) {
+ i_mmap_unlock_write(mapping);
uprobe_mmap(vma);
if (adjust_next)
@@ -3236,7 +3233,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk.
+ * Similarly in do_mmap and in do_brk_flags.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e90f25d6385d..8b84661a6410 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -64,6 +64,8 @@ int sysctl_oom_dump_tasks = 1;
* and mark_oom_victim
*/
DEFINE_MUTEX(oom_lock);
+/* Serializes oom_score_adj and oom_score_adj_min updates */
+DEFINE_MUTEX(oom_adj_mutex);
static inline bool is_memcg_oom(struct oom_control *oc)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 780c8f023b28..05fe1ddb033c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -156,16 +156,16 @@ static int __init early_init_on_alloc(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
if (bool_result)
static_branch_enable(&init_on_alloc);
else
static_branch_disable(&init_on_alloc);
- return ret;
+ return 0;
}
early_param("init_on_alloc", early_init_on_alloc);
@@ -174,16 +174,16 @@ static int __init early_init_on_free(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
if (bool_result)
static_branch_enable(&init_on_free);
else
static_branch_disable(&init_on_free);
- return ret;
+ return 0;
}
early_param("init_on_free", early_init_on_free);
@@ -3741,8 +3741,8 @@ retry:
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->highest_zoneidx, ac->nodemask) {
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
+ ac->nodemask) {
struct page *page;
unsigned long mark;
@@ -3986,8 +3986,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* success so it is time to admit defeat. We will skip the OOM killer
* because it is very likely that the caller has a more reasonable
* fallback than shooting a random task.
+ *
+ * The OOM killer may not free memory on a specific node.
*/
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->highest_zoneidx < ZONE_NORMAL)
@@ -4004,10 +4006,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* failures more gracefully we should just bail out here.
*/
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
- goto out;
-
/* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
@@ -4255,13 +4253,12 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
/* Perform direct synchronous page reclaim */
-static int
+static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
- int progress;
unsigned int noreclaim_flag;
- unsigned long pflags;
+ unsigned long pflags, progress;
cond_resched();
@@ -4840,12 +4837,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
- return true;
-}
-
-/* Determine whether to spread dirty pages and what the first usable zone */
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
-{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4856,6 +4847,8 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
+
+ return true;
}
/*
@@ -4884,8 +4877,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
- finalise_ac(gfp_mask, &ac);
-
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
@@ -4961,6 +4952,9 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
+ else if (!PageHead(page))
+ while (order-- > 0)
+ free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);
@@ -5651,7 +5645,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
int min_val = INT_MAX;
int best_node = NUMA_NO_NODE;
- const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -5672,8 +5665,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = cpumask_of_node(n);
- if (!cpumask_empty(tmp))
+ if (!cpumask_empty(cpumask_of_node(n)))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -5969,7 +5961,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (*pfn < memblock_region_memory_end_pfn(r))
break;
}
@@ -6554,7 +6546,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long start_pfn, end_pfn;
struct memblock_region *r;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
start_pfn = clamp(memblock_region_memory_base_pfn(r),
zone_start_pfn, zone_end_pfn);
end_pfn = clamp(memblock_region_memory_end_pfn(r),
@@ -6998,8 +6990,7 @@ static void __init init_unavailable_mem(void)
* Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_mem_range(i, &memblock.memory, NULL,
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range(i, &start, &end) {
if (next < start)
pgcnt += init_unavailable_range(PFN_DOWN(next),
PFN_UP(start));
@@ -7149,7 +7140,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* options.
*/
if (movable_node_is_enabled()) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (!memblock_is_hotpluggable(r))
continue;
@@ -7170,7 +7161,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
@@ -8234,14 +8225,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
{
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
-
- /*
- * TODO we could make this much more efficient by not checking every
- * page in the range if we know all of them are in MOVABLE_ZONE and
- * that the movable zone guarantees that pages are migratable but
- * the later is not the case right now unfortunatelly. E.g. movablecore
- * can still lead to having bootmem allocations in zone_movable.
- */
+ unsigned long offset = pfn % pageblock_nr_pages;
if (is_migrate_cma_page(page)) {
/*
@@ -8255,12 +8239,18 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
return page;
}
- for (; iter < pageblock_nr_pages; iter++) {
+ for (; iter < pageblock_nr_pages - offset; iter++) {
if (!pfn_valid_within(pfn + iter))
continue;
page = pfn_to_page(pfn + iter);
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
if (PageReserved(page))
return page;
@@ -8334,14 +8324,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
* it. But now, memory offline itself doesn't call
* shrink_node_slabs() and it still to be fixed.
*/
- /*
- * If the page is not RAM, page_count()should be 0.
- * we don't need more check. This is an _used_ not-movable page.
- *
- * The problematic thing here is PG_reserved pages. PG_reserved
- * is set to both of a memory hole page and a _used_ kernel
- * page at boot.
- */
return page;
}
return NULL;
diff --git a/mm/page_counter.c b/mm/page_counter.c
index afe22ad335cc..b24a60b28bb0 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -109,7 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*
* The atomic_long_add_return() implies a full memory
* barrier between incrementing the count and reading
- * the limit. When racing with page_counter_limit(),
+ * the limit. When racing with page_counter_set_max(),
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
diff --git a/mm/page_io.c b/mm/page_io.c
index f9e9267f296f..433df1263349 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -312,7 +312,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -359,13 +359,11 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- ret = 0;
bio = get_swap_bio(GFP_NOIO, page, end_write_func);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
bio_associate_blkg_from_page(bio, page);
@@ -373,8 +371,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
-out:
- return ret;
+
+ return 0;
}
int swap_readpage(struct page *page, bool synchronous)
@@ -403,7 +401,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -467,7 +465,7 @@ int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 63a3db10a8c0..aa94afb63823 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -17,22 +17,21 @@
static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
- struct page *unmovable = NULL;
- struct zone *zone;
+ struct zone *zone = page_zone(page);
+ struct page *unmovable;
unsigned long flags;
- int ret = -EBUSY;
-
- zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
/*
* We assume the caller intended to SET migrate type to isolate.
* If it is already set, then someone else must have raced and
- * set it before us. Return -EBUSY
+ * set it before us.
*/
- if (is_migrate_isolate_page(page))
- goto out;
+ if (is_migrate_isolate_page(page)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return -EBUSY;
+ }
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
@@ -49,25 +48,21 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
NULL);
__mod_zone_freepage_state(zone, -nr_pages, mt);
- ret = 0;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ drain_all_pages(zone);
+ return 0;
}
-out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (!ret) {
- drain_all_pages(zone);
- } else {
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
-
- if ((isol_flags & REPORT_FAILURE) && unmovable)
- /*
- * printk() with zone->lock held will likely trigger a
- * lockdep splat, so defer it here.
- */
- dump_page(unmovable, "unmovable page");
+ if (isol_flags & REPORT_FAILURE) {
+ /*
+ * printk() with zone->lock held will likely trigger a
+ * lockdep splat, so defer it here.
+ */
+ dump_page(unmovable, "unmovable page");
}
- return ret;
+ return -EBUSY;
}
static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
diff --git a/mm/shmem.c b/mm/shmem.c
index d42c27e4769f..6d4ddef4a24f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1830,6 +1830,8 @@ repeat:
return error;
}
+ if (page)
+ hindex = page->index;
if (page && sgp == SGP_WRITE)
mark_page_accessed(page);
@@ -1840,11 +1842,10 @@ repeat:
unlock_page(page);
put_page(page);
page = NULL;
+ hindex = index;
}
- if (page || sgp == SGP_READ) {
- *pagep = page;
- return 0;
- }
+ if (page || sgp == SGP_READ)
+ goto out;
/*
* Fast cache lookup did not find it:
@@ -1969,14 +1970,13 @@ clear:
* it now, lest undo on failure cancel our earlier guarantee.
*/
if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
int i;
- for (i = 0; i < compound_nr(head); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
+ for (i = 0; i < compound_nr(page); i++) {
+ clear_highpage(page + i);
+ flush_dcache_page(page + i);
}
- SetPageUptodate(head);
+ SetPageUptodate(page);
}
/* Perhaps the file has been truncated since we checked */
@@ -1992,6 +1992,7 @@ clear:
error = -EINVAL;
goto unlock;
}
+out:
*pagep = page + index - hindex;
return 0;
diff --git a/mm/slab.c b/mm/slab.c
index f658e86ec8ce..399a9d185b0f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2305,8 +2305,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
- if (!freelist)
- return NULL;
} else {
/* We will use last bytes at the slab for freelist */
freelist = addr + (PAGE_SIZE << cachep->gfporder) -
@@ -3440,7 +3438,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
- memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp);
+ memcg_slab_free_hook(cachep, &objp, 1);
/*
* Skip calling cache_free_alien() when the platform is not numa.
diff --git a/mm/slab.h b/mm/slab.h
index 6cc323f1313a..6dd4b702888a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -345,30 +345,42 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
obj_cgroup_put(objcg);
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
- void *p)
+static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
+ void **p, int objects)
{
+ struct kmem_cache *s;
struct obj_cgroup *objcg;
+ struct page *page;
unsigned int off;
+ int i;
if (!memcg_kmem_enabled())
return;
- if (!page_has_obj_cgroups(page))
- return;
+ for (i = 0; i < objects; i++) {
+ if (unlikely(!p[i]))
+ continue;
- off = obj_to_index(s, page, p);
- objcg = page_obj_cgroups(page)[off];
- page_obj_cgroups(page)[off] = NULL;
+ page = virt_to_head_page(p[i]);
+ if (!page_has_obj_cgroups(page))
+ continue;
- if (!objcg)
- return;
+ if (!s_orig)
+ s = page->slab_cache;
+ else
+ s = s_orig;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
- -obj_full_size(s));
+ off = obj_to_index(s, page, p[i]);
+ objcg = page_obj_cgroups(page)[off];
+ if (!objcg)
+ continue;
- obj_cgroup_put(objcg);
+ page_obj_cgroups(page)[off] = NULL;
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ -obj_full_size(s));
+ obj_cgroup_put(objcg);
+ }
}
#else /* CONFIG_MEMCG_KMEM */
@@ -406,8 +418,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
{
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
- void *p)
+static inline void memcg_slab_free_hook(struct kmem_cache *s,
+ void **p, int objects)
{
}
#endif /* CONFIG_MEMCG_KMEM */
diff --git a/mm/slub.c b/mm/slub.c
index 6d3574013b2f..61d0d2968413 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2245,7 +2245,8 @@ redo:
}
} else {
m = M_FULL;
- if (kmem_cache_debug(s) && !lock) {
+#ifdef CONFIG_SLUB_DEBUG
+ if ((s->flags & SLAB_STORE_USER) && !lock) {
lock = 1;
/*
* This also ensures that the scanning of full
@@ -2254,6 +2255,7 @@ redo:
*/
spin_lock(&n->list_lock);
}
+#endif
}
if (l != m) {
@@ -2661,6 +2663,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
void *freelist;
struct page *page;
+ stat(s, ALLOC_SLOWPATH);
+
page = c->page;
if (!page) {
/*
@@ -2850,7 +2854,6 @@ redo:
page = c->page;
if (unlikely(!object || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
- stat(s, ALLOC_SLOWPATH);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3019,20 +3022,21 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
if (likely(!n)) {
- /*
- * If we just froze the page then put it onto the
- * per cpu partial list.
- */
- if (new.frozen && !was_frozen) {
+ if (likely(was_frozen)) {
+ /*
+ * The list lock was not taken therefore no list
+ * activity can be necessary.
+ */
+ stat(s, FREE_FROZEN);
+ } else if (new.frozen) {
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
put_cpu_partial(s, page, 1);
stat(s, CPU_PARTIAL_FREE);
}
- /*
- * The list lock was not taken therefore no list
- * activity can be necessary.
- */
- if (was_frozen)
- stat(s, FREE_FROZEN);
+
return;
}
@@ -3091,7 +3095,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long tid;
- memcg_slab_free_hook(s, page, head);
+ memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3253,6 +3257,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (WARN_ON(!size))
return;
+ memcg_slab_free_hook(s, p, size);
do {
struct detached_freelist df;
diff --git a/mm/sparse.c b/mm/sparse.c
index fcc3d176f1ea..b25ad8e64839 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -291,13 +291,11 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
*/
static void __init memblocks_present(void)
{
- struct memblock_region *reg;
+ unsigned long start, end;
+ int i, nid;
- for_each_memblock(memory, reg) {
- memory_present(memblock_get_region_node(reg),
- memblock_region_memory_base_pfn(reg),
- memblock_region_memory_end_pfn(reg));
- }
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
+ memory_present(nid, start, end);
}
/*
diff --git a/mm/swap.c b/mm/swap.c
index 65ef7e3525bf..47a47681c86b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -348,7 +348,7 @@ static bool need_activate_page_drain(int cpu)
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
-void activate_page(struct page *page)
+static void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -368,7 +368,7 @@ static inline void activate_page_drain(int cpu)
{
}
-void activate_page(struct page *page)
+static void activate_page(struct page *page)
{
pg_data_t *pgdat = page_pgdat(page);
@@ -481,9 +481,7 @@ EXPORT_SYMBOL(lru_cache_add);
* @vma: vma in which page is mapped for determining reclaimability
*
* Place @page on the inactive or unevictable LRU list, depending on its
- * evictability. Note that if the page is not evictable, it goes
- * directly back onto it's zone's unevictable list, it does NOT use a
- * per cpu pagevec.
+ * evictability.
*/
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
@@ -891,6 +889,7 @@ void release_pages(struct page **pages, int nr)
locked_pgdat = NULL;
}
+ page = compound_head(page);
if (is_huge_zero_page(page))
continue;
@@ -902,7 +901,7 @@ void release_pages(struct page **pages, int nr)
}
/*
* ZONE_DEVICE pages that return 'false' from
- * put_devmap_managed_page() do not require special
+ * page_is_devmap_managed() do not require special
* processing, and instead, expect a call to
* put_page_testzero().
*/
@@ -912,7 +911,6 @@ void release_pages(struct page **pages, int nr)
}
}
- page = compound_head(page);
if (!put_page_testzero(page))
continue;
@@ -943,8 +941,6 @@ void release_pages(struct page **pages, int nr)
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
__ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 3e6453573a89..0357fbe70645 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -237,7 +237,7 @@ static int free_slot_cache(unsigned int cpu)
return 0;
}
-int enable_swap_slots_cache(void)
+void enable_swap_slots_cache(void)
{
mutex_lock(&swap_slots_cache_enable_mutex);
if (!swap_slot_cache_initialized) {
@@ -255,7 +255,6 @@ int enable_swap_slots_cache(void)
__reenable_swap_slots_cache();
out_unlock:
mutex_unlock(&swap_slots_cache_enable_mutex);
- return 0;
}
/* called with swap slot cache's alloc lock held */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c16eebb81d8b..aa40e706604c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,6 +21,7 @@
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
+#include <linux/shmem_fs.h>
#include "internal.h"
/*
@@ -414,6 +415,39 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
return page;
}
+/**
+ * find_get_incore_page - Find and get a page from the page or swap caches.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
+ *
+ * This differs from find_get_page() in that it will also look for the
+ * page in the swap cache.
+ *
+ * Return: The found page or %NULL.
+ */
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+ swp_entry_t swp;
+ struct swap_info_struct *si;
+ struct page *page = find_get_entry(mapping, index);
+
+ if (!page)
+ return page;
+ if (!xa_is_value(page))
+ return find_subpage(page, index);
+ if (!shmem_mapping(mapping))
+ return NULL;
+
+ swp = radix_to_swp_entry(page);
+ /* Prevent swapoff from happening to us */
+ si = get_swap_device(swp);
+ if (!si)
+ return NULL;
+ page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ put_swap_device(si);
+ return page;
+}
+
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
@@ -631,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
goto skip;
/* Test swap type to make sure the dereference is safe */
- if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
+ if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
struct inode *inode = si->swap_file->f_mapping->host;
if (inode_read_congested(inode))
goto skip;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ced4635d924c..c4a613688a17 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1184,7 +1184,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
bad_free:
pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
- goto out;
out:
return NULL;
}
@@ -1929,11 +1928,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
lru_cache_add_inactive_or_unevictable(page, vma);
}
swap_free(entry);
- /*
- * Move the page to the active list so it is not
- * immediately swapped out again after swapon.
- */
- activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
if (page != swapcache) {
@@ -2437,7 +2431,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (ret >= 0)
sis->flags |= SWP_ACTIVATED;
if (!ret) {
- sis->flags |= SWP_FS;
+ sis->flags |= SWP_FS_OPS;
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
}
@@ -3348,7 +3342,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto bad_swap_unlock_inode;
+ goto free_swap_address_space;
}
mutex_lock(&swapon_mutex);
@@ -3373,6 +3367,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
+free_swap_address_space:
+ exit_swap_address_space(p->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
diff --git a/mm/truncate.c b/mm/truncate.c
index dd9ebc1da356..6bbe0f0b3ce9 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -528,23 +528,8 @@ void truncate_inode_pages_final(struct address_space *mapping)
}
EXPORT_SYMBOL(truncate_inode_pages_final);
-/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
- * @start: the offset 'from' which to invalidate
- * @end: the offset 'to' which to invalidate (inclusive)
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- *
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
- *
- * Return: the number of the pages that were invalidated
- */
-unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
@@ -610,8 +595,13 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
* Invalidation is a hint that the page is no longer
* of interest and try to speed up its reclaim.
*/
- if (!ret)
+ if (!ret) {
deactivate_file_page(page);
+ /* It is likely on the pagevec of a remote CPU */
+ if (nr_pagevec)
+ (*nr_pagevec)++;
+ }
+
if (PageTransHuge(page))
put_page(page);
count += ret;
@@ -623,8 +613,40 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
}
return count;
}
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ *
+ * Return: the number of the pages that were invalidated
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ return __invalidate_mapping_pages(mapping, start, end, NULL);
+}
EXPORT_SYMBOL(invalidate_mapping_pages);
+/**
+ * This helper is similar with the above one, except that it accounts for pages
+ * that are likely on a pagevec and count them in @nr_pagevec, which will used by
+ * the caller.
+ */
+void invalidate_mapping_pagevec(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+{
+ __invalidate_mapping_pages(mapping, start, end, nr_pagevec);
+}
+
/*
* This is like invalidate_complete_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index be4724b916b3..04ac98bf5045 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2133,7 +2133,7 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
* It is up to the caller to do all required locking to keep the returned
* pointer valid.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *find_vm_area(const void *addr)
{
@@ -2154,7 +2154,7 @@ struct vm_struct *find_vm_area(const void *addr)
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *remove_vm_area(const void *addr)
{
@@ -2447,7 +2447,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vunmap() */
+ /* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 466fc3144fff..879fb57c5045 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -699,6 +699,9 @@ void drop_slab_node(int nid)
do {
struct mem_cgroup *memcg = NULL;
+ if (fatal_signal_pending(current))
+ return;
+
freed = 0;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
@@ -1751,7 +1754,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* Restrictions:
*
* (1) Must be called with an elevated refcount on the page. This is a
- * fundamentnal difference from isolate_lru_pages (which is called
+ * fundamental difference from isolate_lru_pages (which is called
* without a stable reference).
* (2) the lru_lock must not be held.
* (3) interrupts must be enabled.
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 460b0feced26..18feaa0bc537 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -212,13 +212,12 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
{
struct z3fold_buddy_slots *slots;
- slots = kmem_cache_alloc(pool->c_handle,
+ slots = kmem_cache_zalloc(pool->c_handle,
(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
if (slots) {
/* It will be freed separately in free_handle(). */
kmemleak_not_leak(slots);
- memset(slots->slot, 0, sizeof(slots->slot));
slots->pool = (unsigned long)pool;
rwlock_init(&slots->lock);
}
diff --git a/mm/zbud.c b/mm/zbud.c
index bc93aa4e46fc..c49966ece674 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -367,7 +367,6 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
spin_lock(&pool->lock);
/* First, try to find an unbuddied zbud page. */
- zhdr = NULL;
for_each_unbuddied_list(i, chunks) {
if (!list_empty(&pool->unbuddied[i])) {
zhdr = list_first_entry(&pool->unbuddied[i],
diff --git a/samples/Makefile b/samples/Makefile
index 754553597581..c3392a595e4b 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -28,3 +28,4 @@ subdir-$(CONFIG_SAMPLE_VFS) += vfs
obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/
subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog
subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue
+obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/
diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile
new file mode 100644
index 000000000000..16b6132c540c
--- /dev/null
+++ b/samples/kmemleak/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c
index e19279ff6aa3..7b476eb8285f 100644
--- a/mm/kmemleak-test.c
+++ b/samples/kmemleak/kmemleak-test.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * mm/kmemleak-test.c
+ * samples/kmemleak/kmemleak-test.c
*
* Copyright (C) 2008 ARM Limited
* Written by Catalin Marinas <catalin.marinas@arm.com>
diff --git a/scripts/decodecode b/scripts/decodecode
index fbdb325cdf4f..31d884e35f2f 100755
--- a/scripts/decodecode
+++ b/scripts/decodecode
@@ -6,6 +6,7 @@
# options: set env. variable AFLAGS=options to pass options to "as";
# e.g., to decode an i386 oops on an x86_64 system, use:
# AFLAGS=--32 decodecode < 386.oops
+# PC=hex - the PC (program counter) the oops points to
cleanup() {
rm -f $T $T.s $T.o $T.oo $T.aa $T.dis
@@ -67,15 +68,19 @@ if [ -z "$ARCH" ]; then
esac
fi
+# Params: (tmp_file, pc_sub)
disas() {
- ${CROSS_COMPILE}as $AFLAGS -o $1.o $1.s > /dev/null 2>&1
+ t=$1
+ pc_sub=$2
+
+ ${CROSS_COMPILE}as $AFLAGS -o $t.o $t.s > /dev/null 2>&1
if [ "$ARCH" = "arm" ]; then
if [ $width -eq 2 ]; then
OBJDUMPFLAGS="-M force-thumb"
fi
- ${CROSS_COMPILE}strip $1.o
+ ${CROSS_COMPILE}strip $t.o
fi
if [ "$ARCH" = "arm64" ]; then
@@ -83,11 +88,18 @@ disas() {
type=inst
fi
- ${CROSS_COMPILE}strip $1.o
+ ${CROSS_COMPILE}strip $t.o
fi
- ${CROSS_COMPILE}objdump $OBJDUMPFLAGS -S $1.o | \
- grep -v "/tmp\|Disassembly\|\.text\|^$" > $1.dis 2>&1
+ if [ $pc_sub -ne 0 ]; then
+ if [ $PC ]; then
+ adj_vma=$(( $PC - $pc_sub ))
+ OBJDUMPFLAGS="$OBJDUMPFLAGS --adjust-vma=$adj_vma"
+ fi
+ fi
+
+ ${CROSS_COMPILE}objdump $OBJDUMPFLAGS -S $t.o | \
+ grep -v "/tmp\|Disassembly\|\.text\|^$" > $t.dis 2>&1
}
marker=`expr index "$code" "\<"`
@@ -95,14 +107,17 @@ if [ $marker -eq 0 ]; then
marker=`expr index "$code" "\("`
fi
+
touch $T.oo
if [ $marker -ne 0 ]; then
+ # 2 opcode bytes and a single space
+ pc_sub=$(( $marker / 3 ))
echo All code >> $T.oo
echo ======== >> $T.oo
beforemark=`echo "$code"`
echo -n " .$type 0x" > $T.s
echo $beforemark | sed -e 's/ /,0x/g; s/[<>()]//g' >> $T.s
- disas $T
+ disas $T $pc_sub
cat $T.dis >> $T.oo
rm -f $T.o $T.s $T.dis
@@ -114,7 +129,7 @@ echo =========================================== >> $T.aa
code=`echo $code | sed -e 's/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'`
echo -n " .$type 0x" > $T.s
echo $code >> $T.s
-disas $T
+disas $T 0
cat $T.dis >> $T.aa
# (lines of whole $T.oo) - (lines of $T.aa, i.e. "Code starting") + 3,
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index feb2efaaa5e6..cb46a79665f5 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -9,6 +9,7 @@
#
abandonning||abandoning
abigious||ambiguous
+abitrary||arbitrary
abitrate||arbitrate
abnornally||abnormally
abnrormal||abnormal
@@ -482,6 +483,7 @@ disgest||digest
dispalying||displaying
diplay||display
directon||direction
+direcly||directly
direectly||directly
diregard||disregard
disassocation||disassociation
@@ -871,6 +873,7 @@ malplace||misplace
managable||manageable
managment||management
mangement||management
+manger||manager
manoeuvering||maneuvering
manufaucturing||manufacturing
mappping||mapping
@@ -1478,6 +1481,7 @@ unsolicitied||unsolicited
unsuccessfull||unsuccessful
unsuported||unsupported
untill||until
+ununsed||unused
unuseful||useless
unvalid||invalid
upate||update
diff --git a/tools/testing/nvdimm/dax-dev.c b/tools/testing/nvdimm/dax-dev.c
index 7e5d979e73cb..fb342a8c98d3 100644
--- a/tools/testing/nvdimm/dax-dev.c
+++ b/tools/testing/nvdimm/dax-dev.c
@@ -9,12 +9,19 @@
phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
unsigned long size)
{
- struct resource *res = &dev_dax->region->res;
- phys_addr_t addr;
+ int i;
- addr = pgoff * PAGE_SIZE + res->start;
- if (addr >= res->start && addr <= res->end) {
- if (addr + size - 1 <= res->end) {
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct dev_dax_range *dax_range = &dev_dax->ranges[i];
+ struct range *range = &dax_range->range;
+ unsigned long long pgoff_end;
+ phys_addr_t addr;
+
+ pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1;
+ if (pgoff < dax_range->pgoff || pgoff > pgoff_end)
+ continue;
+ addr = PFN_PHYS(pgoff - dax_range->pgoff) + range->start;
+ if (addr + size - 1 <= range->end) {
if (get_nfit_res(addr)) {
struct page *page;
@@ -23,9 +30,10 @@ phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
page = vmalloc_to_page((void *)addr);
return PFN_PHYS(page_to_pfn(page));
- } else
- return addr;
+ }
+ return addr;
}
+ break;
}
return -1;
}
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 03e40b3b0106..c62d372d426f 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -126,7 +126,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
int error;
- resource_size_t offset = pgmap->res.start;
+ resource_size_t offset = pgmap->range.start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (!nfit_res)
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index a9026706d597..30873b19d04b 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -3,6 +3,23 @@
uname_M := $(shell uname -m 2>/dev/null || echo not)
MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/')
+# Without this, failed build products remain, with up-to-date timestamps,
+# thus tricking Make (and you!) into believing that All Is Well, in subsequent
+# make invocations:
+.DELETE_ON_ERROR:
+
+# Avoid accidental wrong builds, due to built-in rules working just a little
+# bit too well--but not quite as well as required for our situation here.
+#
+# In other words, "make userfaultfd" is supposed to fail to build at all,
+# because this Makefile only supports either "make" (all), or "make /full/path".
+# However, the built-in rules, if not suppressed, will pick up CFLAGS and the
+# initial LDLIBS (but not the target-specific LDLIBS, because those are only
+# set for the full path target!). This causes it to get pretty far into building
+# things despite using incorrect values such as an *occasionally* incomplete
+# LDLIBS.
+MAKEFLAGS += --no-builtin-rules
+
CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
LDLIBS = -lrt
TEST_GEN_FILES = compaction_test
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
index bcec71250873..9b420140ba2b 100644
--- a/tools/testing/selftests/vm/compaction_test.c
+++ b/tools/testing/selftests/vm/compaction_test.c
@@ -18,7 +18,8 @@
#include "../kselftest.h"
-#define MAP_SIZE 1048576
+#define MAP_SIZE_MB 100
+#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024)
struct map_list {
void *map;
@@ -165,7 +166,7 @@ int main(int argc, char **argv)
void *map = NULL;
unsigned long mem_free = 0;
unsigned long hugepage_size = 0;
- unsigned long mem_fragmentable = 0;
+ long mem_fragmentable_MB = 0;
if (prereq() != 0) {
printf("Either the sysctl compact_unevictable_allowed is not\n"
@@ -190,9 +191,9 @@ int main(int argc, char **argv)
return -1;
}
- mem_fragmentable = mem_free * 0.8 / 1024;
+ mem_fragmentable_MB = mem_free * 0.8 / 1024;
- while (mem_fragmentable > 0) {
+ while (mem_fragmentable_MB > 0) {
map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
if (map == MAP_FAILED)
@@ -213,7 +214,7 @@ int main(int argc, char **argv)
for (i = 0; i < MAP_SIZE; i += page_size)
*(unsigned long *)(map + i) = (unsigned long)map + i;
- mem_fragmentable--;
+ mem_fragmentable_MB -= MAP_SIZE_MB;
}
for (entry = list; entry != NULL; entry = entry->next) {
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
index 43b4dfe161a2..31f8bb086907 100644
--- a/tools/testing/selftests/vm/gup_benchmark.c
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -15,12 +15,12 @@
#define PAGE_SIZE sysconf(_SC_PAGESIZE)
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
/* Similar to above, but use FOLL_PIN instead of FOLL_GET. */
-#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
/* Just the flags we need, copied from mm.h: */
#define FOLL_WRITE 0x01 /* check pte is writable */
@@ -52,6 +52,9 @@ int main(int argc, char **argv)
case 'b':
cmd = PIN_BENCHMARK;
break;
+ case 'L':
+ cmd = PIN_LONGTERM_BENCHMARK;
+ break;
case 'm':
size = atoi(optarg) * MB;
break;
@@ -67,9 +70,6 @@ int main(int argc, char **argv)
case 'T':
thp = 0;
break;
- case 'L':
- cmd = GUP_LONGTERM_BENCHMARK;
- break;
case 'U':
cmd = GUP_BENCHMARK;
break;
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 93fc5cadce61..0a28a6a29581 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -680,7 +680,7 @@ TEST_F(hmm, anon_write_hugetlbfs)
n = gethugepagesizes(pagesizes, 4);
if (n <= 0)
- return;
+ SKIP(return, "Huge page size could not be determined");
for (idx = 0; --n > 0; ) {
if (pagesizes[n] < pagesizes[idx])
idx = n;
@@ -694,7 +694,7 @@ TEST_F(hmm, anon_write_hugetlbfs)
buffer->ptr = get_hugepage_region(size, GHR_STRICT);
if (buffer->ptr == NULL) {
free(buffer);
- return;
+ SKIP(return, "Huge page could not be allocated");
}
buffer->fd = -1;