-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 2
-rw-r--r--  Documentation/dev-tools/kasan.rst | 10
-rw-r--r--  Documentation/filesystems/dlmfs.rst | 2
-rw-r--r--  Documentation/filesystems/ocfs2.rst | 2
-rw-r--r--  Documentation/filesystems/tmpfs.rst | 18
-rw-r--r--  Documentation/vm/arch_pgtable_helpers.rst | 258
-rw-r--r--  Documentation/vm/memory-model.rst | 9
-rw-r--r--  Documentation/vm/slub.rst | 37
-rw-r--r--  arch/alpha/include/asm/pgalloc.h | 21
-rw-r--r--  arch/alpha/include/asm/tlbflush.h | 1
-rw-r--r--  arch/alpha/kernel/core_irongate.c | 1
-rw-r--r--  arch/alpha/kernel/core_marvel.c | 1
-rw-r--r--  arch/alpha/kernel/core_titan.c | 1
-rw-r--r--  arch/alpha/kernel/machvec_impl.h | 2
-rw-r--r--  arch/alpha/kernel/smp.c | 1
-rw-r--r--  arch/alpha/mm/numa.c | 1
-rw-r--r--  arch/arc/mm/fault.c | 1
-rw-r--r--  arch/arc/mm/init.c | 1
-rw-r--r--  arch/arm/include/asm/pgalloc.h | 12
-rw-r--r--  arch/arm/include/asm/tlb.h | 1
-rw-r--r--  arch/arm/kernel/machine_kexec.c | 1
-rw-r--r--  arch/arm/kernel/smp.c | 1
-rw-r--r--  arch/arm/kernel/suspend.c | 1
-rw-r--r--  arch/arm/mach-omap2/omap-mpuss-lowpower.c | 1
-rw-r--r--  arch/arm/mm/hugetlbpage.c | 1
-rw-r--r--  arch/arm/mm/init.c | 9
-rw-r--r--  arch/arm/mm/mmu.c | 1
-rw-r--r--  arch/arm64/include/asm/pgalloc.h | 39
-rw-r--r--  arch/arm64/kernel/setup.c | 2
-rw-r--r--  arch/arm64/kernel/smp.c | 1
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 1
-rw-r--r--  arch/arm64/mm/init.c | 6
-rw-r--r--  arch/arm64/mm/ioremap.c | 1
-rw-r--r--  arch/arm64/mm/mmu.c | 59
-rw-r--r--  arch/csky/include/asm/pgalloc.h | 7
-rw-r--r--  arch/csky/kernel/smp.c | 1
-rw-r--r--  arch/hexagon/include/asm/pgalloc.h | 7
-rw-r--r--  arch/ia64/include/asm/pgalloc.h | 24
-rw-r--r--  arch/ia64/include/asm/tlb.h | 1
-rw-r--r--  arch/ia64/kernel/process.c | 1
-rw-r--r--  arch/ia64/kernel/smp.c | 1
-rw-r--r--  arch/ia64/kernel/smpboot.c | 1
-rw-r--r--  arch/ia64/mm/contig.c | 1
-rw-r--r--  arch/ia64/mm/discontig.c | 4
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 1
-rw-r--r--  arch/ia64/mm/tlb.c | 1
-rw-r--r--  arch/m68k/include/asm/mmu_context.h | 2
-rw-r--r--  arch/m68k/include/asm/sun3_pgalloc.h | 7
-rw-r--r--  arch/m68k/kernel/dma.c | 2
-rw-r--r--  arch/m68k/kernel/traps.c | 3
-rw-r--r--  arch/m68k/mm/cache.c | 2
-rw-r--r--  arch/m68k/mm/fault.c | 1
-rw-r--r--  arch/m68k/mm/kmap.c | 2
-rw-r--r--  arch/m68k/mm/mcfmmu.c | 1
-rw-r--r--  arch/m68k/mm/memory.c | 1
-rw-r--r--  arch/m68k/sun3x/dvma.c | 2
-rw-r--r--  arch/microblaze/include/asm/pgalloc.h | 6
-rw-r--r--  arch/microblaze/include/asm/tlbflush.h | 1
-rw-r--r--  arch/microblaze/kernel/process.c | 1
-rw-r--r--  arch/microblaze/kernel/signal.c | 1
-rw-r--r--  arch/microblaze/mm/init.c | 3
-rw-r--r--  arch/mips/include/asm/pgalloc.h | 19
-rw-r--r--  arch/mips/kernel/setup.c | 8
-rw-r--r--  arch/mips/loongson64/numa.c | 1
-rw-r--r--  arch/mips/sgi-ip27/ip27-memory.c | 2
-rw-r--r--  arch/mips/sgi-ip32/ip32-memory.c | 1
-rw-r--r--  arch/nds32/mm/mm-nds32.c | 2
-rw-r--r--  arch/nios2/include/asm/pgalloc.h | 7
-rw-r--r--  arch/openrisc/include/asm/pgalloc.h | 33
-rw-r--r--  arch/openrisc/include/asm/tlbflush.h | 1
-rw-r--r--  arch/openrisc/kernel/or32_ksyms.c | 1
-rw-r--r--  arch/parisc/include/asm/mmu_context.h | 1
-rw-r--r--  arch/parisc/include/asm/pgalloc.h | 12
-rw-r--r--  arch/parisc/kernel/cache.c | 1
-rw-r--r--  arch/parisc/kernel/pci-dma.c | 1
-rw-r--r--  arch/parisc/kernel/process.c | 1
-rw-r--r--  arch/parisc/kernel/signal.c | 1
-rw-r--r--  arch/parisc/kernel/smp.c | 1
-rw-r--r--  arch/parisc/mm/hugetlbpage.c | 1
-rw-r--r--  arch/parisc/mm/init.c | 5
-rw-r--r--  arch/parisc/mm/ioremap.c | 2
-rw-r--r--  arch/powerpc/include/asm/tlb.h | 1
-rw-r--r--  arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 1
-rw-r--r--  arch/powerpc/mm/book3s64/hash_pgtable.c | 1
-rw-r--r--  arch/powerpc/mm/book3s64/hash_tlb.c | 1
-rw-r--r--  arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1
-rw-r--r--  arch/powerpc/mm/init_32.c | 1
-rw-r--r--  arch/powerpc/mm/init_64.c | 4
-rw-r--r--  arch/powerpc/mm/kasan/8xx.c | 1
-rw-r--r--  arch/powerpc/mm/kasan/book3s_32.c | 1
-rw-r--r--  arch/powerpc/mm/mem.c | 3
-rw-r--r--  arch/powerpc/mm/nohash/40x.c | 1
-rw-r--r--  arch/powerpc/mm/nohash/8xx.c | 1
-rw-r--r--  arch/powerpc/mm/nohash/fsl_booke.c | 1
-rw-r--r--  arch/powerpc/mm/nohash/kaslr_booke.c | 1
-rw-r--r--  arch/powerpc/mm/nohash/tlb.c | 1
-rw-r--r--  arch/powerpc/mm/numa.c | 1
-rw-r--r--  arch/powerpc/mm/pgtable.c | 1
-rw-r--r--  arch/powerpc/mm/pgtable_64.c | 1
-rw-r--r--  arch/powerpc/mm/ptdump/hashpagetable.c | 2
-rw-r--r--  arch/powerpc/mm/ptdump/ptdump.c | 1
-rw-r--r--  arch/powerpc/platforms/pseries/cmm.c | 1
-rw-r--r--  arch/riscv/include/asm/pgalloc.h | 18
-rw-r--r--  arch/riscv/mm/fault.c | 1
-rw-r--r--  arch/riscv/mm/init.c | 3
-rw-r--r--  arch/s390/crypto/prng.c | 4
-rw-r--r--  arch/s390/include/asm/tlb.h | 1
-rw-r--r--  arch/s390/include/asm/tlbflush.h | 1
-rw-r--r--  arch/s390/kernel/machine_kexec.c | 1
-rw-r--r--  arch/s390/kernel/ptrace.c | 1
-rw-r--r--  arch/s390/kvm/diag.c | 1
-rw-r--r--  arch/s390/kvm/priv.c | 1
-rw-r--r--  arch/s390/kvm/pv.c | 1
-rw-r--r--  arch/s390/mm/cmm.c | 1
-rw-r--r--  arch/s390/mm/init.c | 1
-rw-r--r--  arch/s390/mm/mmap.c | 1
-rw-r--r--  arch/s390/mm/pgtable.c | 1
-rw-r--r--  arch/sh/include/asm/pgalloc.h | 4
-rw-r--r--  arch/sh/kernel/idle.c | 1
-rw-r--r--  arch/sh/kernel/machine_kexec.c | 1
-rw-r--r--  arch/sh/mm/cache-sh3.c | 1
-rw-r--r--  arch/sh/mm/cache-sh7705.c | 1
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 1
-rw-r--r--  arch/sh/mm/init.c | 7
-rw-r--r--  arch/sh/mm/ioremap_fixed.c | 1
-rw-r--r--  arch/sh/mm/numa.c | 3
-rw-r--r--  arch/sh/mm/tlb-sh3.c | 1
-rw-r--r--  arch/sparc/include/asm/ide.h | 1
-rw-r--r--  arch/sparc/include/asm/tlb_64.h | 1
-rw-r--r--  arch/sparc/kernel/leon_smp.c | 1
-rw-r--r--  arch/sparc/kernel/process_32.c | 1
-rw-r--r--  arch/sparc/kernel/signal_32.c | 1
-rw-r--r--  arch/sparc/kernel/smp_32.c | 1
-rw-r--r--  arch/sparc/kernel/smp_64.c | 1
-rw-r--r--  arch/sparc/kernel/sun4m_irq.c | 1
-rw-r--r--  arch/sparc/mm/highmem.c | 1
-rw-r--r--  arch/sparc/mm/init_64.c | 1
-rw-r--r--  arch/sparc/mm/io-unit.c | 1
-rw-r--r--  arch/sparc/mm/iommu.c | 1
-rw-r--r--  arch/sparc/mm/tlb.c | 1
-rw-r--r--  arch/um/include/asm/pgalloc.h | 9
-rw-r--r--  arch/um/include/asm/pgtable-3level.h | 3
-rw-r--r--  arch/um/kernel/mem.c | 17
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 1
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 1
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 42
-rw-r--r--  arch/x86/kernel/alternative.c | 1
-rw-r--r--  arch/x86/kernel/apic/apic.c | 1
-rw-r--r--  arch/x86/kernel/mpparse.c | 1
-rw-r--r--  arch/x86/kernel/traps.c | 1
-rw-r--r--  arch/x86/mm/fault.c | 1
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 1
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/mm/init_64.c | 12
-rw-r--r--  arch/x86/mm/kaslr.c | 1
-rw-r--r--  arch/x86/mm/pgtable_32.c | 1
-rw-r--r--  arch/x86/mm/pti.c | 1
-rw-r--r--  arch/x86/platform/uv/bios_uv.c | 1
-rw-r--r--  arch/x86/power/hibernate.c | 2
-rw-r--r--  arch/xtensa/include/asm/pgalloc.h | 40
-rw-r--r--  arch/xtensa/kernel/xtensa_ksyms.c | 1
-rw-r--r--  arch/xtensa/mm/cache.c | 1
-rw-r--r--  arch/xtensa/mm/fault.c | 1
-rw-r--r--  crypto/adiantum.c | 2
-rw-r--r--  crypto/ahash.c | 4
-rw-r--r--  crypto/api.c | 2
-rw-r--r--  crypto/asymmetric_keys/verify_pefile.c | 4
-rw-r--r--  crypto/deflate.c | 2
-rw-r--r--  crypto/drbg.c | 10
-rw-r--r--  crypto/ecc.c | 8
-rw-r--r--  crypto/ecdh.c | 2
-rw-r--r--  crypto/gcm.c | 2
-rw-r--r--  crypto/gf128mul.c | 4
-rw-r--r--  crypto/jitterentropy-kcapi.c | 2
-rw-r--r--  crypto/rng.c | 2
-rw-r--r--  crypto/rsa-pkcs1pad.c | 6
-rw-r--r--  crypto/seqiv.c | 2
-rw-r--r--  crypto/shash.c | 2
-rw-r--r--  crypto/skcipher.c | 2
-rw-r--r--  crypto/testmgr.c | 6
-rw-r--r--  crypto/zstd.c | 2
-rw-r--r--  drivers/base/node.c | 10
-rw-r--r--  drivers/block/xen-blkback/common.h | 1
-rw-r--r--  drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c | 2
-rw-r--r--  drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c | 2
-rw-r--r--  drivers/crypto/amlogic/amlogic-gxl-cipher.c | 4
-rw-r--r--  drivers/crypto/atmel-ecc.c | 2
-rw-r--r--  drivers/crypto/caam/caampkc.c | 28
-rw-r--r--  drivers/crypto/cavium/cpt/cptvf_main.c | 6
-rw-r--r--  drivers/crypto/cavium/cpt/cptvf_reqmanager.c | 12
-rw-r--r--  drivers/crypto/cavium/nitrox/nitrox_lib.c | 4
-rw-r--r--  drivers/crypto/cavium/zip/zip_crypto.c | 6
-rw-r--r--  drivers/crypto/ccp/ccp-crypto-rsa.c | 6
-rw-r--r--  drivers/crypto/ccree/cc_aead.c | 4
-rw-r--r--  drivers/crypto/ccree/cc_buffer_mgr.c | 4
-rw-r--r--  drivers/crypto/ccree/cc_cipher.c | 6
-rw-r--r--  drivers/crypto/ccree/cc_hash.c | 8
-rw-r--r--  drivers/crypto/ccree/cc_request_mgr.c | 2
-rw-r--r--  drivers/crypto/marvell/cesa/hash.c | 2
-rw-r--r--  drivers/crypto/marvell/octeontx/otx_cptvf_main.c | 6
-rw-r--r--  drivers/crypto/marvell/octeontx/otx_cptvf_reqmgr.h | 2
-rw-r--r--  drivers/crypto/nx/nx.c | 4
-rw-r--r--  drivers/crypto/virtio/virtio_crypto_algs.c | 12
-rw-r--r--  drivers/crypto/virtio/virtio_crypto_core.c | 2
-rw-r--r--  drivers/iommu/ipmmu-vmsa.c | 1
-rw-r--r--  drivers/md/dm-crypt.c | 32
-rw-r--r--  drivers/md/dm-integrity.c | 6
-rw-r--r--  drivers/misc/ibmvmc.c | 6
-rw-r--r--  drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 2
-rw-r--r--  drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 6
-rw-r--r--  drivers/net/ppp/ppp_mppe.c | 6
-rw-r--r--  drivers/net/wireguard/noise.c | 4
-rw-r--r--  drivers/net/wireguard/peer.c | 2
-rw-r--r--  drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 2
-rw-r--r--  drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 6
-rw-r--r--  drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 6
-rw-r--r--  drivers/net/wireless/intersil/orinoco/wext.c | 4
-rw-r--r--  drivers/s390/crypto/ap_bus.h | 4
-rw-r--r--  drivers/staging/ks7010/ks_hostif.c | 2
-rw-r--r--  drivers/staging/rtl8723bs/core/rtw_security.c | 2
-rw-r--r--  drivers/staging/wlan-ng/p80211netdev.c | 2
-rw-r--r--  drivers/target/iscsi/iscsi_target_auth.c | 2
-rw-r--r--  drivers/xen/balloon.c | 1
-rw-r--r--  drivers/xen/privcmd.c | 1
-rw-r--r--  fs/Kconfig | 21
-rw-r--r--  fs/aio.c | 6
-rw-r--r--  fs/binfmt_elf_fdpic.c | 1
-rw-r--r--  fs/cifs/cifsencrypt.c | 2
-rw-r--r--  fs/cifs/connect.c | 10
-rw-r--r--  fs/cifs/dfs_cache.c | 2
-rw-r--r--  fs/cifs/misc.c | 8
-rw-r--r--  fs/crypto/inline_crypt.c | 5
-rw-r--r--  fs/crypto/keyring.c | 6
-rw-r--r--  fs/crypto/keysetup_v1.c | 4
-rw-r--r--  fs/ecryptfs/keystore.c | 4
-rw-r--r--  fs/ecryptfs/messaging.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/ntfs/dir.c | 2
-rw-r--r--  fs/ntfs/inode.c | 27
-rw-r--r--  fs/ntfs/inode.h | 4
-rw-r--r--  fs/ntfs/mft.c | 4
-rw-r--r--  fs/ocfs2/Kconfig | 6
-rw-r--r--  fs/ocfs2/acl.c | 2
-rw-r--r--  fs/ocfs2/blockcheck.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 8
-rw-r--r--  fs/ocfs2/ocfs2.h | 4
-rw-r--r--  fs/ocfs2/suballoc.c | 4
-rw-r--r--  fs/ocfs2/suballoc.h | 2
-rw-r--r--  fs/ocfs2/super.c | 4
-rw-r--r--  fs/proc/meminfo.c | 10
-rw-r--r--  include/asm-generic/pgalloc.h | 80
-rw-r--r--  include/asm-generic/tlb.h | 1
-rw-r--r--  include/crypto/aead.h | 2
-rw-r--r--  include/crypto/akcipher.h | 2
-rw-r--r--  include/crypto/gf128mul.h | 2
-rw-r--r--  include/crypto/hash.h | 2
-rw-r--r--  include/crypto/internal/acompress.h | 2
-rw-r--r--  include/crypto/kpp.h | 2
-rw-r--r--  include/crypto/skcipher.h | 2
-rw-r--r--  include/linux/efi.h | 4
-rw-r--r--  include/linux/fs.h | 17
-rw-r--r--  include/linux/huge_mm.h | 2
-rw-r--r--  include/linux/kasan.h | 4
-rw-r--r--  include/linux/memcontrol.h | 203
-rw-r--r--  include/linux/mm.h | 86
-rw-r--r--  include/linux/mm_types.h | 5
-rw-r--r--  include/linux/mman.h | 4
-rw-r--r--  include/linux/mmu_notifier.h | 13
-rw-r--r--  include/linux/mmzone.h | 52
-rw-r--r--  include/linux/pageblock-flags.h | 24
-rw-r--r--  include/linux/percpu_counter.h | 4
-rw-r--r--  include/linux/sched/mm.h | 8
-rw-r--r--  include/linux/shmem_fs.h | 3
-rw-r--r--  include/linux/slab.h | 9
-rw-r--r--  include/linux/slab_def.h | 9
-rw-r--r--  include/linux/slub_def.h | 31
-rw-r--r--  include/linux/swap.h | 2
-rw-r--r--  include/linux/vmstat.h | 14
-rw-r--r--  init/Kconfig | 9
-rw-r--r--  init/main.c | 2
-rw-r--r--  ipc/shm.c | 2
-rw-r--r--  kernel/fork.c | 54
-rw-r--r--  kernel/kthread.c | 8
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/rcu/tree.c | 2
-rw-r--r--  kernel/scs.c | 2
-rw-r--r--  kernel/sysctl.c | 2
-rw-r--r--  lib/Kconfig.kasan | 39
-rw-r--r--  lib/Makefile | 1
-rw-r--r--  lib/mpi/mpiutil.c | 6
-rw-r--r--  lib/percpu_counter.c | 19
-rw-r--r--  lib/test_kasan.c | 87
-rw-r--r--  mm/Kconfig | 6
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/debug.c | 83
-rw-r--r--  mm/debug_vm_pgtable.c | 666
-rw-r--r--  mm/filemap.c | 9
-rw-r--r--  mm/gup.c | 3
-rw-r--r--  mm/huge_memory.c | 12
-rw-r--r--  mm/hugetlb.c | 25
-rw-r--r--  mm/ioremap.c (renamed from lib/ioremap.c) | 2
-rw-r--r--  mm/kasan/common.c | 41
-rw-r--r--  mm/kasan/generic.c | 43
-rw-r--r--  mm/kasan/generic_report.c | 1
-rw-r--r--  mm/kasan/kasan.h | 23
-rw-r--r--  mm/kasan/quarantine.c | 1
-rw-r--r--  mm/kasan/report.c | 54
-rw-r--r--  mm/kasan/tags.c | 37
-rw-r--r--  mm/khugepaged.c | 75
-rw-r--r--  mm/memcontrol.c | 752
-rw-r--r--  mm/memory.c | 7
-rw-r--r--  mm/memory_hotplug.c | 11
-rw-r--r--  mm/migrate.c | 6
-rw-r--r--  mm/mm_init.c | 22
-rw-r--r--  mm/mmap.c | 45
-rw-r--r--  mm/mremap.c | 17
-rw-r--r--  mm/nommu.c | 6
-rw-r--r--  mm/oom_kill.c | 2
-rw-r--r--  mm/page-writeback.c | 6
-rw-r--r--  mm/page_alloc.c | 220
-rw-r--r--  mm/page_counter.c | 6
-rw-r--r--  mm/page_io.c | 2
-rw-r--r--  mm/pgalloc-track.h | 51
-rw-r--r--  mm/shmem.c | 129
-rw-r--r--  mm/shuffle.c | 46
-rw-r--r--  mm/shuffle.h | 17
-rw-r--r--  mm/slab.c | 103
-rw-r--r--  mm/slab.h | 395
-rw-r--r--  mm/slab_common.c | 703
-rw-r--r--  mm/slob.c | 12
-rw-r--r--  mm/slub.c | 588
-rw-r--r--  mm/sparse-vmemmap.c | 56
-rw-r--r--  mm/sparse.c | 31
-rw-r--r--  mm/swap_slots.c | 45
-rw-r--r--  mm/swap_state.c | 2
-rw-r--r--  mm/util.c | 52
-rw-r--r--  mm/vmalloc.c | 176
-rw-r--r--  mm/vmscan.c | 39
-rw-r--r--  mm/vmstat.c | 38
-rw-r--r--  mm/workingset.c | 6
-rw-r--r--  net/atm/mpoa_caches.c | 4
-rw-r--r--  net/bluetooth/ecdh_helper.c | 6
-rw-r--r--  net/bluetooth/smp.c | 24
-rw-r--r--  net/core/sock.c | 2
-rw-r--r--  net/ipv4/tcp_fastopen.c | 2
-rw-r--r--  net/mac80211/aead_api.c | 4
-rw-r--r--  net/mac80211/aes_gmac.c | 2
-rw-r--r--  net/mac80211/key.c | 2
-rw-r--r--  net/mac802154/llsec.c | 20
-rw-r--r--  net/sctp/auth.c | 2
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_crypto.c | 4
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_keys.c | 6
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_mech.c | 2
-rw-r--r--  net/tipc/crypto.c | 10
-rw-r--r--  net/wireless/core.c | 2
-rw-r--r--  net/wireless/ibss.c | 4
-rw-r--r--  net/wireless/lib80211_crypt_tkip.c | 2
-rw-r--r--  net/wireless/lib80211_crypt_wep.c | 2
-rw-r--r--  net/wireless/nl80211.c | 24
-rw-r--r--  net/wireless/sme.c | 6
-rw-r--r--  net/wireless/util.c | 2
-rw-r--r--  net/wireless/wext-sme.c | 2
-rw-r--r--  scripts/Makefile.kasan | 3
-rwxr-xr-x  scripts/bloat-o-meter | 2
-rw-r--r--  scripts/coccinelle/free/devm_free.cocci | 4
-rw-r--r--  scripts/coccinelle/free/ifnullfree.cocci | 4
-rw-r--r--  scripts/coccinelle/free/kfree.cocci | 6
-rw-r--r--  scripts/coccinelle/free/kfreeaddr.cocci | 2
-rw-r--r--  scripts/const_structs.checkpatch | 1
-rwxr-xr-x  scripts/decode_stacktrace.sh | 79
-rw-r--r--  scripts/spelling.txt | 19
-rwxr-xr-x  scripts/tags.sh | 18
-rw-r--r--  security/apparmor/domain.c | 4
-rw-r--r--  security/apparmor/include/file.h | 2
-rw-r--r--  security/apparmor/policy.c | 24
-rw-r--r--  security/apparmor/policy_ns.c | 6
-rw-r--r--  security/apparmor/policy_unpack.c | 14
-rw-r--r--  security/keys/big_key.c | 6
-rw-r--r--  security/keys/dh.c | 14
-rw-r--r--  security/keys/encrypted-keys/encrypted.c | 14
-rw-r--r--  security/keys/trusted-keys/trusted_tpm1.c | 34
-rw-r--r--  security/keys/user_defined.c | 6
-rw-r--r--  tools/cgroup/memcg_slabinfo.py | 226
-rw-r--r--  tools/include/linux/jhash.h | 2
-rw-r--r--  tools/lib/rbtree.c | 2
-rw-r--r--  tools/lib/traceevent/event-parse.h | 2
-rw-r--r--  tools/testing/ktest/examples/README | 2
-rw-r--r--  tools/testing/ktest/examples/crosstests.conf | 2
-rw-r--r--  tools/testing/selftests/Makefile | 1
-rw-r--r--  tools/testing/selftests/cgroup/.gitignore | 1
-rw-r--r--  tools/testing/selftests/cgroup/Makefile | 2
-rw-r--r--  tools/testing/selftests/cgroup/cgroup_util.c | 2
-rw-r--r--  tools/testing/selftests/cgroup/test_kmem.c | 382
-rw-r--r--  tools/testing/selftests/mincore/.gitignore | 2
-rw-r--r--  tools/testing/selftests/mincore/Makefile | 6
-rw-r--r--  tools/testing/selftests/mincore/mincore_selftest.c | 361
396 files changed, 4868 insertions, 3399 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 00b993aa9365..98ea67f27809 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4693,7 +4693,7 @@
fragmentation. Defaults to 1 for systems with
more than 32MB of RAM, 0 otherwise.
- slub_debug[=options[,slabs]] [MM, SLUB]
+ slub_debug[=options[,slabs][;[options[,slabs]]...]] [MM, SLUB]
Enabling slub_debug allows one to determine the
culprit if slab objects become corrupted. Enabling
slub_debug can create guard zones around objects and
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index c652d740735d..38fd5681fade 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -13,11 +13,8 @@ KASAN uses compile-time instrumentation to insert validity checks before every
memory access, and therefore requires a compiler version that supports that.
Generic KASAN is supported in both GCC and Clang. With GCC it requires version
-4.9.2 or later for basic support and version 5.0 or later for detection of
-out-of-bounds accesses for stack and global variables and for inline
-instrumentation mode (see the Usage section). With Clang it requires version
-7.0.0 or later and it doesn't support detection of out-of-bounds accesses for
-global variables yet.
+8.3.0 or later. With Clang it requires version 7.0.0 or later, but detection of
+out-of-bounds accesses for global variables is only supported since Clang 11.
Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
@@ -193,6 +190,9 @@ function calls GCC directly inserts the code to check the shadow memory.
This option significantly enlarges kernel but it gives x1.1-x2 performance
boost over outline instrumented kernel.
+Generic KASAN prints up to 2 call_rcu() call stacks in reports, the last one
+and the second to last.
+
Software tag-based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~
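As an illustration of the report change noted above: a use-after-free of an
object freed through an RCU callback is the kind of bug whose Generic KASAN
report now also carries the last two call_rcu() call stacks. A minimal sketch
in the spirit of lib/test_kasan.c (the function name and size below are made
up for this example, not taken from the patch)::

    #include <linux/slab.h>

    static noinline void kasan_uaf_sketch(void)
    {
            char *p = kmalloc(64, GFP_KERNEL);

            if (!p)
                    return;
            kfree(p);
            /* The invalid access below is what Generic KASAN reports. */
            ((volatile char *)p)[0] = 'x';
    }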
diff --git a/Documentation/filesystems/dlmfs.rst b/Documentation/filesystems/dlmfs.rst
index 68daaa7facf9..28dd41a63be2 100644
--- a/Documentation/filesystems/dlmfs.rst
+++ b/Documentation/filesystems/dlmfs.rst
@@ -12,7 +12,7 @@ dlmfs is built with OCFS2 as it requires most of its infrastructure.
:Project web page: http://ocfs2.wiki.kernel.org
:Tools web page: https://github.com/markfasheh/ocfs2-tools
-:OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+:OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/
All code copyright 2005 Oracle except when otherwise noted.
diff --git a/Documentation/filesystems/ocfs2.rst b/Documentation/filesystems/ocfs2.rst
index 412386bc6506..42ca9a3d4c6e 100644
--- a/Documentation/filesystems/ocfs2.rst
+++ b/Documentation/filesystems/ocfs2.rst
@@ -14,7 +14,7 @@ get "mount.ocfs2" and "ocfs2_hb_ctl".
Project web page: http://ocfs2.wiki.kernel.org
Tools git tree: https://github.com/markfasheh/ocfs2-tools
-OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/
All code copyright 2005 Oracle except when otherwise noted.
diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 4e95929301a5..c44f8b1d3cab 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -150,6 +150,22 @@ These options do not have any effect on remount. You can change these
parameters with chmod(1), chown(1) and chgrp(1) on a mounted filesystem.
+tmpfs has a mount option to select whether it will wrap at 32- or 64-bit inode
+numbers:
+
+======= ========================
+inode64 Use 64-bit inode numbers
+inode32 Use 32-bit inode numbers
+======= ========================
+
+On a 32-bit kernel, inode32 is implicit, and inode64 is refused at mount time.
+On a 64-bit kernel, CONFIG_TMPFS_INODE64 sets the default. inode64 avoids the
+possibility of multiple files with the same inode number on a single device;
+but risks glibc failing with EOVERFLOW once 33-bit inode numbers are reached -
+if a long-lived tmpfs is accessed by 32-bit applications so ancient that
+opening a file larger than 2GiB fails with EINVAL.
+
+
So 'mount -t tmpfs -o size=10G,nr_inodes=10k,mode=700 tmpfs /mytmpfs'
will give you tmpfs instance on /mytmpfs which can allocate 10GB
RAM/SWAP in 10240 inodes and it is only accessible by root.
@@ -161,3 +177,5 @@ RAM/SWAP in 10240 inodes and it is only accessible by root.
Hugh Dickins, 4 June 2007
:Updated:
KOSAKI Motohiro, 16 Mar 2010
+:Updated:
+ Chris Down, 13 July 2020
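For illustration only (the mount point and size below are made up, and this is
a userspace sketch rather than anything from the patch), the inode64 option is
passed as part of the ordinary tmpfs mount data string, so it can be requested
directly through mount(2)::

    #include <stdio.h>
    #include <sys/mount.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct stat st;

            /* Equivalent to: mount -t tmpfs -o size=10G,inode64 tmpfs /mytmpfs */
            if (mount("tmpfs", "/mytmpfs", "tmpfs", 0, "size=10G,inode64")) {
                    perror("mount");  /* inode64 is refused on a 32-bit kernel */
                    return 1;
            }
            if (stat("/mytmpfs", &st) == 0)
                    printf("root inode: %llu\n", (unsigned long long)st.st_ino);
            return 0;
    }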
diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst
new file mode 100644
index 000000000000..f3591ee3aaa8
--- /dev/null
+++ b/Documentation/vm/arch_pgtable_helpers.rst
@@ -0,0 +1,258 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _arch_page_table_helpers:
+
+===============================
+Architecture Page Table Helpers
+===============================
+
+Generic MM expects architectures (with MMU) to provide helpers to create, access
+and modify page table entries at various levels for different memory functions.
+These page table helpers need to conform to common semantics across platforms.
+The following tables describe the expected semantics, which can also be tested
+during boot via the CONFIG_DEBUG_VM_PGTABLE option. Any future changes here or
+in the debug test need to be kept in sync.
+
+======================
+PTE Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pte_same | Tests whether both PTE entries are the same |
++---------------------------+--------------------------------------------------+
+| pte_bad | Tests a non-table mapped PTE |
++---------------------------+--------------------------------------------------+
+| pte_present | Tests a valid mapped PTE |
++---------------------------+--------------------------------------------------+
+| pte_young | Tests a young PTE |
++---------------------------+--------------------------------------------------+
+| pte_dirty | Tests a dirty PTE |
++---------------------------+--------------------------------------------------+
+| pte_write | Tests a writable PTE |
++---------------------------+--------------------------------------------------+
+| pte_special | Tests a special PTE |
++---------------------------+--------------------------------------------------+
+| pte_protnone | Tests a PROT_NONE PTE |
++---------------------------+--------------------------------------------------+
+| pte_devmap | Tests a ZONE_DEVICE mapped PTE |
++---------------------------+--------------------------------------------------+
+| pte_soft_dirty | Tests a soft dirty PTE |
++---------------------------+--------------------------------------------------+
+| pte_swp_soft_dirty | Tests a soft dirty swapped PTE |
++---------------------------+--------------------------------------------------+
+| pte_mkyoung | Creates a young PTE |
++---------------------------+--------------------------------------------------+
+| pte_mkold | Creates an old PTE |
++---------------------------+--------------------------------------------------+
+| pte_mkdirty | Creates a dirty PTE |
++---------------------------+--------------------------------------------------+
+| pte_mkclean | Creates a clean PTE |
++---------------------------+--------------------------------------------------+
+| pte_mkwrite | Creates a writable PTE |
++---------------------------+--------------------------------------------------+
+| pte_wrprotect | Creates a write protected PTE |
++---------------------------+--------------------------------------------------+
+| pte_mkspecial | Creates a special PTE |
++---------------------------+--------------------------------------------------+
+| pte_mkdevmap | Creates a ZONE_DEVICE mapped PTE |
++---------------------------+--------------------------------------------------+
+| pte_mksoft_dirty | Creates a soft dirty PTE |
++---------------------------+--------------------------------------------------+
+| pte_clear_soft_dirty | Clears a soft dirty PTE |
++---------------------------+--------------------------------------------------+
+| pte_swp_mksoft_dirty | Creates a soft dirty swapped PTE |
++---------------------------+--------------------------------------------------+
+| pte_swp_clear_soft_dirty | Clears a soft dirty swapped PTE |
++---------------------------+--------------------------------------------------+
+| pte_mknotpresent | Invalidates a mapped PTE |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear | Clears a PTE |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear_full | Clears a PTE |
++---------------------------+--------------------------------------------------+
+| ptep_test_and_clear_young | Clears young from a PTE |
++---------------------------+--------------------------------------------------+
+| ptep_set_wrprotect | Converts into a write protected PTE |
++---------------------------+--------------------------------------------------+
+| ptep_set_access_flags | Converts into a more permissive PTE |
++---------------------------+--------------------------------------------------+
+
+======================
+PMD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pmd_same | Tests whether both PMD entries are the same |
++---------------------------+--------------------------------------------------+
+| pmd_bad | Tests a non-table mapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_leaf | Tests a leaf mapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_huge | Tests a HugeTLB mapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD |
++---------------------------+--------------------------------------------------+
+| pmd_present | Tests a valid mapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_young | Tests a young PMD |
++---------------------------+--------------------------------------------------+
+| pmd_dirty | Tests a dirty PMD |
++---------------------------+--------------------------------------------------+
+| pmd_write | Tests a writable PMD |
++---------------------------+--------------------------------------------------+
+| pmd_special | Tests a special PMD |
++---------------------------+--------------------------------------------------+
+| pmd_protnone | Tests a PROT_NONE PMD |
++---------------------------+--------------------------------------------------+
+| pmd_devmap | Tests a ZONE_DEVICE mapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_soft_dirty | Tests a soft dirty PMD |
++---------------------------+--------------------------------------------------+
+| pmd_swp_soft_dirty | Tests a soft dirty swapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkyoung | Creates a young PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkold | Creates an old PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkdirty | Creates a dirty PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkclean | Creates a clean PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkwrite | Creates a writable PMD |
++---------------------------+--------------------------------------------------+
+| pmd_wrprotect | Creates a write protected PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkspecial | Creates a special PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkdevmap | Creates a ZONE_DEVICE mapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mksoft_dirty | Creates a soft dirty PMD |
++---------------------------+--------------------------------------------------+
+| pmd_clear_soft_dirty | Clears a soft dirty PMD |
++---------------------------+--------------------------------------------------+
+| pmd_swp_mksoft_dirty | Creates a soft dirty swapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD |
++---------------------------+--------------------------------------------------+
+| pmd_mkinvalid | Invalidates a mapped PMD [1] |
++---------------------------+--------------------------------------------------+
+| pmd_set_huge | Creates a PMD huge mapping |
++---------------------------+--------------------------------------------------+
+| pmd_clear_huge | Clears a PMD huge mapping |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear | Clears a PMD |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear_full | Clears a PMD |
++---------------------------+--------------------------------------------------+
+| pmdp_test_and_clear_young | Clears young from a PMD |
++---------------------------+--------------------------------------------------+
+| pmdp_set_wrprotect | Converts into a write protected PMD |
++---------------------------+--------------------------------------------------+
+| pmdp_set_access_flags | Converts into a more permissive PMD |
++---------------------------+--------------------------------------------------+
+
+======================
+PUD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pud_same | Tests whether both PUD entries are the same |
++---------------------------+--------------------------------------------------+
+| pud_bad | Tests a non-table mapped PUD |
++---------------------------+--------------------------------------------------+
+| pud_leaf | Tests a leaf mapped PUD |
++---------------------------+--------------------------------------------------+
+| pud_huge | Tests a HugeTLB mapped PUD |
++---------------------------+--------------------------------------------------+
+| pud_trans_huge | Tests a Transparent Huge Page (THP) at PUD |
++---------------------------+--------------------------------------------------+
+| pud_present | Tests a valid mapped PUD |
++---------------------------+--------------------------------------------------+
+| pud_young | Tests a young PUD |
++---------------------------+--------------------------------------------------+
+| pud_dirty | Tests a dirty PUD |
++---------------------------+--------------------------------------------------+
+| pud_write | Tests a writable PUD |
++---------------------------+--------------------------------------------------+
+| pud_devmap | Tests a ZONE_DEVICE mapped PUD |
++---------------------------+--------------------------------------------------+
+| pud_mkyoung | Creates a young PUD |
++---------------------------+--------------------------------------------------+
+| pud_mkold | Creates an old PUD |
++---------------------------+--------------------------------------------------+
+| pud_mkdirty | Creates a dirty PUD |
++---------------------------+--------------------------------------------------+
+| pud_mkclean | Creates a clean PUD |
++---------------------------+--------------------------------------------------+
+| pud_mkwrite | Creates a writable PUD |
++---------------------------+--------------------------------------------------+
+| pud_wrprotect | Creates a write protected PUD |
++---------------------------+--------------------------------------------------+
+| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD |
++---------------------------+--------------------------------------------------+
+| pud_mkinvalid | Invalidates a mapped PUD [1] |
++---------------------------+--------------------------------------------------+
+| pud_set_huge | Creates a PUD huge mapping |
++---------------------------+--------------------------------------------------+
+| pud_clear_huge | Clears a PUD huge mapping |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear | Clears a PUD |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear_full | Clears a PUD |
++---------------------------+--------------------------------------------------+
+| pudp_test_and_clear_young | Clears young from a PUD |
++---------------------------+--------------------------------------------------+
+| pudp_set_wrprotect | Converts into a write protected PUD |
++---------------------------+--------------------------------------------------+
+| pudp_set_access_flags | Converts into a more permissive PUD |
++---------------------------+--------------------------------------------------+
+
+==========================
+HugeTLB Page Table Helpers
+==========================
+
++---------------------------+--------------------------------------------------+
+| pte_huge | Tests a HugeTLB |
++---------------------------+--------------------------------------------------+
+| pte_mkhuge | Creates a HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_pte_dirty | Tests a dirty HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_pte_write | Tests a writable HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkdirty | Creates a dirty HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkwrite | Creates a writable HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_pte_wrprotect | Creates a write protected HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_ptep_get_and_clear | Clears a HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_wrprotect | Converts into a write protected HugeTLB |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB |
++---------------------------+--------------------------------------------------+
+
+========================
+SWAP Page Table Helpers
+========================
+
++---------------------------+--------------------------------------------------+
+| __pte_to_swp_entry | Creates a swapped entry (arch) from a mapped PTE |
++---------------------------+--------------------------------------------------+
+| __swp_entry_to_pte | Creates a mapped PTE from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| __pmd_to_swp_entry | Creates a swapped entry (arch) from a mapped PMD |
++---------------------------+--------------------------------------------------+
+| __swp_entry_to_pmd | Creates a mapped PMD from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| is_migration_entry | Tests a migration (read or write) swapped entry |
++---------------------------+--------------------------------------------------+
+| is_write_migration_entry | Tests a write migration swapped entry |
++---------------------------+--------------------------------------------------+
+| make_migration_entry_read | Converts into read migration swapped entry |
++---------------------------+--------------------------------------------------+
+| make_migration_entry | Creates a migration swapped entry (read or write)|
++---------------------------+--------------------------------------------------+
+
+[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
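The semantics tabulated above are what the CONFIG_DEBUG_VM_PGTABLE boot test
(mm/debug_vm_pgtable.c in this series) exercises. As a rough, simplified
sketch of the style of check involved, shown here for a few PTE helpers only
(the real tests construct the entries carefully and cover the PMD/PUD levels
as well)::

    #include <linux/bug.h>
    #include <linux/init.h>
    #include <linux/pgtable.h>

    static void __init pte_helper_semantics(pte_t pte)
    {
            /* Creating an attribute must be visible via the matching test... */
            WARN_ON(!pte_young(pte_mkyoung(pte)));
            WARN_ON(!pte_dirty(pte_mkdirty(pte)));
            WARN_ON(!pte_write(pte_mkwrite(pte)));

            /* ...and clearing it must be visible as well. */
            WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
            WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
            WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
    }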
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
index cc65bc85d260..769449734573 100644
--- a/Documentation/vm/memory-model.rst
+++ b/Documentation/vm/memory-model.rst
@@ -141,11 +141,8 @@ sections:
`mem_section` objects and the number of rows is calculated to fit
all the memory sections.
-The architecture setup code should call :c:func:`memory_present` for
-each active memory range or use :c:func:`memblocks_present` or
-:c:func:`sparse_memory_present_with_active_regions` wrappers to
-initialize the memory sections. Next, the actual memory maps should be
-set up using :c:func:`sparse_init`.
+The architecture setup code should call sparse_init() to
+initialize the memory sections and the memory maps.
With SPARSEMEM there are two possible ways to convert a PFN to the
corresponding `struct page` - a "classic sparse" and "sparse
@@ -178,7 +175,7 @@ for persistent memory devices in pre-allocated storage on those
devices. This storage is represented with :c:type:`struct vmem_altmap`
that is eventually passed to vmemmap_populate() through a long chain
of function calls. The vmemmap_populate() implementation may use the
-`vmem_altmap` along with :c:func:`altmap_alloc_block_buf` helper to
+`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to
allocate memory map on the persistent memory device.
ZONE_DEVICE
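The arch-side effect of this simplification can be seen in the arm and arm64
hunks later in this diff. Roughly, the setup path reduces to the following
sketch (names follow the arm64 code in this series; not a verbatim copy)::

    #include <linux/memblock.h>
    #include <linux/pfn.h>

    void __init bootmem_init(void)
    {
            unsigned long min = PFN_UP(memblock_start_of_DRAM());
            unsigned long max = PFN_DOWN(memblock_end_of_DRAM());

            /* ... fixed memblock reservations happen before this point ... */

            /*
             * No separate memory_present()/memblocks_present() call anymore:
             * sparse_init() walks memblock itself to register the sections
             * and then builds the memory map.
             */
            sparse_init();

            zone_sizes_init(min, max);
    }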
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
index 4eee598555c9..289d231cee97 100644
--- a/Documentation/vm/slub.rst
+++ b/Documentation/vm/slub.rst
@@ -41,6 +41,11 @@ slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
Enable options only for select slabs (no spaces
after a comma)
+Multiple blocks of options for all slabs or selected slabs can be given, with
+blocks of options delimited by ';'. The last of the "all slabs" blocks applies
+to all slabs except those that match one of the "select slabs" blocks. A slab
+gets the options of the first "select slabs" block that matches its name.
+
Possible debug options are::
F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
@@ -83,17 +88,33 @@ switch off debugging for such caches by default, use::
slub_debug=O
-In case you forgot to enable debugging on the kernel command line: It is
-possible to enable debugging manually when the kernel is up. Look at the
-contents of::
+You can apply different options to different lists of slab names, using blocks
+of options. The following enables red zoning for the dentry cache and user
+tracking for the kmalloc caches; all other slabs get no debugging::
+
+ slub_debug=Z,dentry;U,kmalloc-*
+
+You can also enable options (e.g. sanity checks and poisoning) for all caches
+except those deemed too performance critical to debug. Specify the global
+debug options first, followed by a list of slab names whose options are just
+"-"::
+
+ slub_debug=FZ;-,zs_handle,zspage
+
+The state of each debug option for a slab can be found in the respective files
+under::
/sys/kernel/slab/<slab name>/
-Look at the writable files. Writing 1 to them will enable the
-corresponding debug option. All options can be set on a slab that does
-not contain objects. If the slab already contains objects then sanity checks
-and tracing may only be enabled. The other options may cause the realignment
-of objects.
+If a file contains 1, the option is enabled; 0 means it is disabled. The debug
+options from the ``slub_debug`` parameter translate to the following files::
+
+ F sanity_checks
+ Z red_zone
+ P poison
+ U store_user
+ T trace
+ A failslab
Careful with tracing: It may spew out lots of information and never stop if
used on the wrong slab.
diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h
index a1a29f60934c..9c6a24fe493d 100644
--- a/arch/alpha/include/asm/pgalloc.h
+++ b/arch/alpha/include/asm/pgalloc.h
@@ -5,7 +5,7 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
/*
* Allocate and free page tables. The xxx_kernel() versions are
@@ -34,23 +34,4 @@ pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
extern pgd_t *pgd_alloc(struct mm_struct *mm);
-static inline void
-pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long)pgd);
-}
-
-static inline pmd_t *
-pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- pmd_t *ret = (pmd_t *)__get_free_page(GFP_PGTABLE_USER);
- return ret;
-}
-
-static inline void
-pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- free_page((unsigned long)pmd);
-}
-
#endif /* _ALPHA_PGALLOC_H */
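The helpers removed here (and in the other per-arch pgalloc.h hunks below) are
now picked up from include/asm-generic/pgalloc.h, which this series extends
with generic fallbacks such as pgd_free(), pmd_alloc_one() and pmd_free(). As
a rough paraphrase of what the PMD pair looks like there (a sketch, not the
verbatim header)::

    #include <linux/mm.h>

    #ifndef __HAVE_ARCH_PMD_ALLOC_ONE
    static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
    {
            struct page *page;
            gfp_t gfp = GFP_PGTABLE_USER;

            if (mm == &init_mm)
                    gfp = GFP_PGTABLE_KERNEL;
            page = alloc_pages(gfp, 0);
            if (!page)
                    return NULL;
            if (!pgtable_pmd_page_ctor(page)) {
                    __free_pages(page, 0);
                    return NULL;
            }
            return (pmd_t *)page_address(page);
    }
    #endif

    #ifndef __HAVE_ARCH_PMD_FREE
    static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
    {
            BUG_ON((unsigned long)pmd & (PAGE_SIZE - 1));
            pgtable_pmd_page_dtor(virt_to_page(pmd));
            free_page((unsigned long)pmd);
    }
    #endif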
diff --git a/arch/alpha/include/asm/tlbflush.h b/arch/alpha/include/asm/tlbflush.h
index f8b492408f51..94dc37cf873a 100644
--- a/arch/alpha/include/asm/tlbflush.h
+++ b/arch/alpha/include/asm/tlbflush.h
@@ -5,7 +5,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/compiler.h>
-#include <asm/pgalloc.h>
#ifndef __EXTERN_INLINE
#define __EXTERN_INLINE extern inline
diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c
index a9fd133a7fb2..72af1e72d833 100644
--- a/arch/alpha/kernel/core_irongate.c
+++ b/arch/alpha/kernel/core_irongate.c
@@ -302,7 +302,6 @@ irongate_init_arch(void)
#include <linux/agp_backend.h>
#include <linux/agpgart.h>
#include <linux/export.h>
-#include <asm/pgalloc.h>
#define GET_PAGE_DIR_OFF(addr) (addr >> 22)
#define GET_PAGE_DIR_IDX(addr) (GET_PAGE_DIR_OFF(addr))
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index 1db9d0eb2922..4c80d992a659 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -23,7 +23,6 @@
#include <asm/ptrace.h>
#include <asm/smp.h>
#include <asm/gct.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/vga.h>
diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c
index 2a2820fb1be6..77f5d68ed04b 100644
--- a/arch/alpha/kernel/core_titan.c
+++ b/arch/alpha/kernel/core_titan.c
@@ -20,7 +20,6 @@
#include <asm/ptrace.h>
#include <asm/smp.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/vga.h>
diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h
index 38f045ec5cd2..393d5d6ca5d2 100644
--- a/arch/alpha/kernel/machvec_impl.h
+++ b/arch/alpha/kernel/machvec_impl.h
@@ -7,8 +7,6 @@
* This file has goodies to help simplify instantiation of machine vectors.
*/
-#include <asm/pgalloc.h>
-
/* Whee. These systems don't have an HAE:
IRONGATE, MARVEL, POLARIS, TSUNAMI, TITAN, WILDFIRE
Fix things up for the GENERIC kernel by defining the HAE address
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 631cc17410d1..f4dd9f3f3001 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -36,7 +36,6 @@
#include <asm/io.h>
#include <asm/irq.h>
-#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index 5ad6087de1d6..0636e254a22f 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <asm/hwrpb.h>
-#include <asm/pgalloc.h>
#include <asm/sections.h>
pg_data_t node_data[MAX_NUMNODES];
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 72f5405a7ec5..7287c793d1c9 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -13,7 +13,6 @@
#include <linux/kdebug.h>
#include <linux/perf_event.h>
#include <linux/mm_types.h>
-#include <asm/pgalloc.h>
#include <asm/mmu.h>
/*
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index e7bdc2ac1c87..f886ac69d8ad 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -14,7 +14,6 @@
#include <linux/module.h>
#include <linux/highmem.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/arcregs.h>
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index 069da393110c..15f4674715f8 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -22,17 +22,6 @@
#ifdef CONFIG_ARM_LPAE
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pmd_t *)get_zeroed_page(GFP_KERNEL);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- free_page((unsigned long)pmd);
-}
-
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
@@ -76,6 +65,7 @@ static inline void clean_pte_table(pte_t *pte)
#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
#define __HAVE_ARCH_PTE_ALLOC_ONE
+#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>
static inline pte_t *
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 4d4e7b6aabff..9415222b49ad 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -27,7 +27,6 @@
#else /* !CONFIG_MMU */
#include <linux/swap.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
static inline void __tlb_remove_table(void *_table)
diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index 974b6c64d3e6..5d84ad333f05 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -11,7 +11,6 @@
#include <linux/irq.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
-#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/fncpy.h>
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 9a6432557871..5d9da61eff62 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -37,7 +37,6 @@
#include <asm/idmap.h>
#include <asm/topology.h>
#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
#include <asm/procinfo.h>
#include <asm/processor.h>
#include <asm/sections.h>
diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c
index d2c9338d74e8..24bd20564be7 100644
--- a/arch/arm/kernel/suspend.c
+++ b/arch/arm/kernel/suspend.c
@@ -7,7 +7,6 @@
#include <asm/bugs.h>
#include <asm/cacheflush.h>
#include <asm/idmap.h>
-#include <asm/pgalloc.h>
#include <asm/memory.h>
#include <asm/smp_plat.h>
#include <asm/suspend.h>
diff --git a/arch/arm/mach-omap2/omap-mpuss-lowpower.c b/arch/arm/mach-omap2/omap-mpuss-lowpower.c
index 67fa28532a3a..9fba98c2313a 100644
--- a/arch/arm/mach-omap2/omap-mpuss-lowpower.c
+++ b/arch/arm/mach-omap2/omap-mpuss-lowpower.c
@@ -42,7 +42,6 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/smp_scu.h>
-#include <asm/pgalloc.h>
#include <asm/suspend.h>
#include <asm/virt.h>
#include <asm/hardware/cache-l2x0.h>
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index a1e5aace897a..dd7a0277c5c0 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -17,7 +17,6 @@
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
/*
* On ARM, huge pages are backed by pmd's rather than pte's, so we do a lot
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 01e18e43b174..000c1b48e973 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -243,13 +243,8 @@ void __init bootmem_init(void)
(phys_addr_t)max_low_pfn << PAGE_SHIFT);
/*
- * Sparsemem tries to allocate bootmem in memory_present(),
- * so must be done after the fixed reservations
- */
- memblocks_present();
-
- /*
- * sparse_init() needs the bootmem allocator up and running.
+ * sparse_init() tries to allocate memory from memblock, so must be
+ * done after the fixed reservations
*/
sparse_init();
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index cc3c9a6a1113..c36f977b2ccb 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -29,6 +29,7 @@
#include <asm/traps.h>
#include <asm/procinfo.h>
#include <asm/memory.h>
+#include <asm/pgalloc.h>
#include <asm/mach/arch.h>
#include <asm/mach/map.h>
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 58e93583ddb6..3c6a7f5988b1 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -13,37 +13,13 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
#if CONFIG_PGTABLE_LEVELS > 2
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- gfp_t gfp = GFP_PGTABLE_USER;
- struct page *page;
-
- if (mm == &init_mm)
- gfp = GFP_PGTABLE_KERNEL;
-
- page = alloc_page(gfp);
- if (!page)
- return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_page(page);
- return NULL;
- }
- return page_address(page);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp)
-{
- BUG_ON((unsigned long)pmdp & (PAGE_SIZE-1));
- pgtable_pmd_page_dtor(virt_to_page(pmdp));
- free_page((unsigned long)pmdp);
-}
-
static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
{
set_pud(pudp, __pud(__phys_to_pud_val(pmdp) | prot));
@@ -62,17 +38,6 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
#if CONFIG_PGTABLE_LEVELS > 3
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pud_t *)__get_free_page(GFP_PGTABLE_USER);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pudp)
-{
- BUG_ON((unsigned long)pudp & (PAGE_SIZE-1));
- free_page((unsigned long)pudp);
-}
-
static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
{
set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot));
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index c793276ec7ad..87e81d29e6fb 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -276,7 +276,7 @@ arch_initcall(reserve_memblock_reserved_regions);
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
-void __init setup_arch(char **cmdline_p)
+void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index e43a8ff19f0f..8059d50bc8cb 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -43,7 +43,6 @@
#include <asm/kvm_mmu.h>
#include <asm/mmu_context.h>
#include <asm/numa.h>
-#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/smp_plat.h>
#include <asm/sections.h>
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index aa421bf4956e..55ecf6de9ff7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -17,7 +17,6 @@
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
/*
* HugeTLB Support Matrix
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index f8c19c6c8e71..481d22c32a2e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -430,11 +430,9 @@ void __init bootmem_init(void)
#endif
/*
- * Sparsemem tries to allocate bootmem in memory_present(), so must be
- * done after the fixed reservations.
+ * sparse_init() tries to allocate memory from memblock, so must be
+ * done after the fixed reservations
*/
- memblocks_present();
-
sparse_init();
zone_sizes_init(min, max);
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index 9be71bee902c..b5e83c46b23e 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -16,7 +16,6 @@
#include <asm/fixmap.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size,
pgprot_t prot, void *caller)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1df25f26571d..75df62fea1b6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -35,6 +35,7 @@
#include <asm/mmu_context.h>
#include <asm/ptdump.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
#define NO_BLOCK_MAPPINGS BIT(0)
#define NO_CONT_MAPPINGS BIT(1)
@@ -760,15 +761,20 @@ int kern_addr_valid(unsigned long addr)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-static void free_hotplug_page_range(struct page *page, size_t size)
+static void free_hotplug_page_range(struct page *page, size_t size,
+ struct vmem_altmap *altmap)
{
- WARN_ON(PageReserved(page));
- free_pages((unsigned long)page_address(page), get_order(size));
+ if (altmap) {
+ vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+ } else {
+ WARN_ON(PageReserved(page));
+ free_pages((unsigned long)page_address(page), get_order(size));
+ }
}
static void free_hotplug_pgtable_page(struct page *page)
{
- free_hotplug_page_range(page, PAGE_SIZE);
+ free_hotplug_page_range(page, PAGE_SIZE, NULL);
}
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
@@ -791,7 +797,8 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
}
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
- unsigned long end, bool free_mapped)
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
{
pte_t *ptep, pte;
@@ -805,12 +812,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
pte_clear(&init_mm, addr, ptep);
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
if (free_mapped)
- free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
+ free_hotplug_page_range(pte_page(pte),
+ PAGE_SIZE, altmap);
} while (addr += PAGE_SIZE, addr < end);
}
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
- unsigned long end, bool free_mapped)
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
{
unsigned long next;
pmd_t *pmdp, pmd;
@@ -833,16 +842,17 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
if (free_mapped)
free_hotplug_page_range(pmd_page(pmd),
- PMD_SIZE);
+ PMD_SIZE, altmap);
continue;
}
WARN_ON(!pmd_table(pmd));
- unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
+ unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
- unsigned long end, bool free_mapped)
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
{
unsigned long next;
pud_t *pudp, pud;
@@ -865,16 +875,17 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
if (free_mapped)
free_hotplug_page_range(pud_page(pud),
- PUD_SIZE);
+ PUD_SIZE, altmap);
continue;
}
WARN_ON(!pud_table(pud));
- unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
+ unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
- unsigned long end, bool free_mapped)
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
{
unsigned long next;
p4d_t *p4dp, p4d;
@@ -887,16 +898,24 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
continue;
WARN_ON(!p4d_present(p4d));
- unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
+ unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
- bool free_mapped)
+ bool free_mapped, struct vmem_altmap *altmap)
{
unsigned long next;
pgd_t *pgdp, pgd;
+ /*
+ * altmap can only be used as vmemmap mapping backing memory.
+ * In case the backing memory itself is not being freed, then
+ * altmap is irrelevant. Warn about this inconsistency when
+ * encountered.
+ */
+ WARN_ON(!free_mapped && altmap);
+
do {
next = pgd_addr_end(addr, end);
pgdp = pgd_offset_k(addr);
@@ -905,7 +924,7 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
continue;
WARN_ON(!pgd_present(pgd));
- unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
+ unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
@@ -1069,7 +1088,7 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
- return vmemmap_populate_basepages(start, end, node);
+ return vmemmap_populate_basepages(start, end, node, altmap);
}
#else /* !ARM64_SWAPPER_USES_SECTION_MAPS */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -1101,7 +1120,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
if (pmd_none(READ_ONCE(*pmdp))) {
void *p = NULL;
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (!p)
return -ENOMEM;
@@ -1119,7 +1138,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
#ifdef CONFIG_MEMORY_HOTPLUG
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
- unmap_hotplug_range(start, end, true);
+ unmap_hotplug_range(start, end, true, altmap);
free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
#endif
}
@@ -1410,7 +1429,7 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
WARN_ON(pgdir != init_mm.pgd);
WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
- unmap_hotplug_range(start, end, false);
+ unmap_hotplug_range(start, end, false, NULL);
free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index c7c1ed27e348..d58d8146b729 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -9,7 +9,7 @@
#include <linux/sched.h>
#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
pte_t *pte)
@@ -42,11 +42,6 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
return pte;
}
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_pages((unsigned long)pgd, PGD_ORDER);
-}
-
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret;
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index e7425e6b0419..041d0de6a1b6 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -23,7 +23,6 @@
#include <asm/traps.h>
#include <asm/sections.h>
#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
#ifdef CONFIG_CPU_HAS_FPU
#include <abi/fpu.h>
#endif
diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h
index cc9be514a676..f0c47e6a7427 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -11,7 +11,7 @@
#include <asm/mem-layout.h>
#include <asm/atomic.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
extern unsigned long long kmap_generation;
@@ -41,11 +41,6 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
return pgd;
}
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long) pgd);
-}
-
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
pgtable_t pte)
{
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index 2a3050345099..9601cfe83c94 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -29,11 +29,6 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long)pgd);
-}
-
#if CONFIG_PGTABLE_LEVELS == 4
static inline void
p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud)
@@ -41,15 +36,6 @@ p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud)
p4d_val(*p4d_entry) = __pa(pud);
}
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
- free_page((unsigned long)pud);
-}
#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud)
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
@@ -59,16 +45,6 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
pud_val(*pud_entry) = __pa(pmd);
}
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- free_page((unsigned long)pmd);
-}
-
#define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd)
static inline void
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index f1f257d632b3..8d9da6f08a62 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -42,7 +42,6 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 7a4de9d994c5..ec0b40f6e9c6 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -40,7 +40,6 @@
#include <asm/elf.h>
#include <asm/irq.h>
#include <asm/kexec.h>
-#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/sal.h>
#include <asm/switch_to.h>
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index bbfd421e6deb..0e2742003121 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -39,7 +39,6 @@
#include <asm/io.h>
#include <asm/irq.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/ptrace.h>
#include <asm/sal.h>
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 016683b743c2..c29c600d7967 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -49,7 +49,6 @@
#include <asm/irq.h>
#include <asm/mca.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/ptrace.h>
#include <asm/sal.h>
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index d7d31c718d2d..e30e360beef8 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -21,7 +21,6 @@
#include <linux/swap.h>
#include <asm/meminit.h>
-#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/mca.h>
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index da810ca234da..dbe829fc5298 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -24,7 +24,6 @@
#include <linux/efi.h>
#include <linux/nodemask.h>
#include <linux/slab.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/meminit.h>
#include <asm/numa.h>
@@ -601,7 +600,6 @@ void __init paging_init(void)
max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
#ifdef CONFIG_VIRTUAL_MEM_MAP
@@ -656,7 +654,7 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
- return vmemmap_populate_basepages(start, end, node);
+ return vmemmap_populate_basepages(start, end, node, NULL);
}
void vmemmap_free(unsigned long start, unsigned long end,
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 32352a73df0c..b331f94d20ac 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -18,7 +18,6 @@
#include <linux/sysctl.h>
#include <linux/log2.h>
#include <asm/mman.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index 71c19918e387..135b5135cace 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -27,7 +27,6 @@
#include <asm/delay.h>
#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
#include <asm/pal.h>
#include <asm/tlbflush.h>
#include <asm/dma.h>
diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h
index cac9f289d1f6..993fd7e37069 100644
--- a/arch/m68k/include/asm/mmu_context.h
+++ b/arch/m68k/include/asm/mmu_context.h
@@ -222,7 +222,7 @@ static inline void activate_mm(struct mm_struct *prev_mm,
#include <asm/setup.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h
index 11b95dadf7c0..000f64869b91 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -13,7 +13,7 @@
#include <asm/tlb.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
extern const char bad_pmd_string[];
@@ -40,11 +40,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page
*/
#define pmd_free(mm, x) do { } while (0)
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long) pgd);
-}
-
static inline pgd_t * pgd_alloc(struct mm_struct *mm)
{
pgd_t *new_pgd;
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index 871a0e11da34..b1ca3522eccc 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -15,7 +15,7 @@
#include <linux/vmalloc.h>
#include <linux/export.h>
-#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
#if defined(CONFIG_MMU) && !defined(CONFIG_COLDFIRE)
void arch_dma_prep_coherent(struct page *page, size_t size)
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index df6fc782754f..546e81935fe8 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -35,10 +35,9 @@
#include <asm/fpu.h>
#include <linux/uaccess.h>
#include <asm/traps.h>
-#include <asm/pgalloc.h>
#include <asm/machdep.h>
#include <asm/siginfo.h>
-
+#include <asm/tlbflush.h>
static const char *vec_names[] = {
[VEC_RESETSP] = "RESET SP",
diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c
index 5ecb3310e874..b486c0889eec 100644
--- a/arch/m68k/mm/cache.c
+++ b/arch/m68k/mm/cache.c
@@ -8,7 +8,7 @@
*/
#include <linux/module.h>
-#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
#include <asm/traps.h>
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index a94a814ad6ad..508abb63da67 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -15,7 +15,6 @@
#include <asm/setup.h>
#include <asm/traps.h>
-#include <asm/pgalloc.h>
extern void die_if_kernel(char *, struct pt_regs *, long);
diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c
index 14d31d216cef..1269d513b221 100644
--- a/arch/m68k/mm/kmap.c
+++ b/arch/m68k/mm/kmap.c
@@ -19,8 +19,8 @@
#include <asm/setup.h>
#include <asm/segment.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/io.h>
+#include <asm/tlbflush.h>
#undef DEBUG
diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c
index a3e7e4e70dd5..2b9cb4a62281 100644
--- a/arch/m68k/mm/mcfmmu.c
+++ b/arch/m68k/mm/mcfmmu.c
@@ -20,6 +20,7 @@
#include <asm/mmu_context.h>
#include <asm/mcf_pgalloc.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
#define KMAPAREA(x) ((x >= VMALLOC_START) && (x < KMAP_END))
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index 65e0c4071912..fe75aecfb238 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -17,7 +17,6 @@
#include <asm/setup.h>
#include <asm/segment.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/traps.h>
#include <asm/machdep.h>
diff --git a/arch/m68k/sun3x/dvma.c b/arch/m68k/sun3x/dvma.c
index fef52d222d46..08bb92113026 100644
--- a/arch/m68k/sun3x/dvma.c
+++ b/arch/m68k/sun3x/dvma.c
@@ -22,7 +22,7 @@
#include <asm/dvma.h>
#include <asm/io.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
/* IOMMU support */
diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h
index ebb6b7939bb8..8839ce00ea05 100644
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@@ -28,12 +28,6 @@ static inline pgd_t *get_pgd(void)
return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
}
-static inline void free_pgd(pgd_t *pgd)
-{
- free_page((unsigned long)pgd);
-}
-
-#define pgd_free(mm, pgd) free_pgd(pgd)
#define pgd_alloc(mm) get_pgd()
#define pmd_pgtable(pmd) pmd_page(pmd)
diff --git a/arch/microblaze/include/asm/tlbflush.h b/arch/microblaze/include/asm/tlbflush.h
index 6f8f5c77a050..1200e2bf14bb 100644
--- a/arch/microblaze/include/asm/tlbflush.h
+++ b/arch/microblaze/include/asm/tlbflush.h
@@ -15,7 +15,6 @@
#include <asm/processor.h> /* For TASK_SIZE */
#include <asm/mmu.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
extern void _tlbie(unsigned long address);
extern void _tlbia(void);
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 6cabeab9e2ba..a9e46e525cd0 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -18,7 +18,6 @@
#include <linux/tick.h>
#include <linux/bitops.h>
#include <linux/ptrace.h>
-#include <asm/pgalloc.h>
#include <linux/uaccess.h> /* for USER_DS macros */
#include <asm/cacheflush.h>
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index bdd6d0c86e16..65bf5fd8d473 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -35,7 +35,6 @@
#include <asm/entry.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <linux/syscalls.h>
#include <asm/cacheflush.h>
#include <asm/syscalls.h>
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 521b59ba716c..0880a003573d 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -172,9 +172,6 @@ void __init setup_memory(void)
&memblock.memory, 0);
}
- /* XXX need to clip this if using highmem? */
- sparse_memory_present_with_active_regions(0);
-
paging_init();
}
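Note: this hunk, and the later ones that drop memblocks_present() and sparse_memory_present_with_active_regions(), all rely on sparse_init() marking present sections from memblock on its own. After the series a typical arch paging path reduces to roughly the following sketch (the zone setup is illustrative, not taken from any particular architecture):

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	/*
	 * sparse_init() now walks memblock.memory itself; no explicit
	 * memory_present()/memblocks_present() call is needed here.
	 */
	sparse_init();

	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;	/* illustrative zone layout */
	free_area_init(max_zone_pfns);
}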
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index fa77cb71f303..8b18424b3120 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -13,7 +13,9 @@
#include <linux/mm.h>
#include <linux/sched.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PMD_ALLOC_ONE
+#define __HAVE_ARCH_PUD_ALLOC_ONE
+#include <asm-generic/pgalloc.h>
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
pte_t *pte)
@@ -47,11 +49,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
extern void pgd_init(unsigned long page);
extern pgd_t *pgd_alloc(struct mm_struct *mm);
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_pages((unsigned long)pgd, PGD_ORDER);
-}
-
#define __pte_free_tlb(tlb,pte,address) \
do { \
pgtable_pte_page_dtor(pte); \
@@ -70,11 +67,6 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
return pmd;
}
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- free_pages((unsigned long)pmd, PMD_ORDER);
-}
-
#define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x)
#endif
@@ -91,11 +83,6 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
return pud;
}
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
- free_pages((unsigned long)pud, PUD_ORDER);
-}
-
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
set_p4d(p4d, __p4d((unsigned long)pud));
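Note: the convention in this and the other pgalloc.h hunks is to define __HAVE_ARCH_<helper> only for the allocators an architecture genuinely has to customise and to take everything else from <asm-generic/pgalloc.h>. A hypothetical arch header following that convention could look like this (pmd_init() and its use are made up for illustration):

/* arch/foo/include/asm/pgalloc.h -- hypothetical example */
#define __HAVE_ARCH_PMD_ALLOC_ONE	/* PMD entries need non-zero initialisation */
#include <asm-generic/pgalloc.h>	/* supplies the generic pte/pmd/pud/pgd helpers */

static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
	pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL);

	if (pmd)
		pmd_init((unsigned long)pmd);	/* hypothetical arch-specific table init */
	return pmd;
}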
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 588b21245e00..bf5f5acab0a8 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -371,14 +371,6 @@ static void __init bootmem_init(void)
#endif
}
-
- /*
- * In any case the memory regions added to memblock
- * (highmem/lowmem, available/reserved, etc.) are considered
- * present, so inform sparsemem about them.
- */
- memblocks_present();
-
/*
* Reserve initrd memory if needed.
*/
diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
index 901f5be5ee76..ea8bb1bc667e 100644
--- a/arch/mips/loongson64/numa.c
+++ b/arch/mips/loongson64/numa.c
@@ -220,7 +220,6 @@ static __init void prom_meminit(void)
cpumask_clear(&__node_cpumask[node]);
}
}
- memblocks_present();
max_low_pfn = PHYS_PFN(memblock_end_of_DRAM());
for (cpu = 0; cpu < loongson_sysconf.nr_cpus; cpu++) {
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 1213215ea965..d411e0a90a5b 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -402,8 +402,6 @@ void __init prom_meminit(void)
}
__node_data[node] = &null_node;
}
-
- memblocks_present();
}
void __init prom_free_prom_memory(void)
diff --git a/arch/mips/sgi-ip32/ip32-memory.c b/arch/mips/sgi-ip32/ip32-memory.c
index be1b2cfc4c3e..62b956cc2d1d 100644
--- a/arch/mips/sgi-ip32/ip32-memory.c
+++ b/arch/mips/sgi-ip32/ip32-memory.c
@@ -14,7 +14,6 @@
#include <asm/ip32/crime.h>
#include <asm/bootinfo.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
extern void crime_init(void);
diff --git a/arch/nds32/mm/mm-nds32.c b/arch/nds32/mm/mm-nds32.c
index 8503bee882d1..55bec50ccc03 100644
--- a/arch/nds32/mm/mm-nds32.c
+++ b/arch/nds32/mm/mm-nds32.c
@@ -2,6 +2,8 @@
// Copyright (C) 2005-2017 Andes Technology Corporation
#include <linux/init_task.h>
+
+#define __HAVE_ARCH_PGD_FREE
#include <asm/pgalloc.h>
#define FIRST_KERNEL_PGD_NR (USER_PTRS_PER_PGD)
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index 0b146d773c85..e6600d2a5ae0 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -12,7 +12,7 @@
#include <linux/mm.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
pte_t *pte)
@@ -34,11 +34,6 @@ extern void pmd_init(unsigned long page, unsigned long pagetable);
extern pgd_t *pgd_alloc(struct mm_struct *mm);
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_pages((unsigned long)pgd, PGD_ORDER);
-}
-
#define __pte_free_tlb(tlb, pte, addr) \
do { \
pgtable_pte_page_dtor(pte); \
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index da12a4c38c4b..88820299ecc4 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -20,6 +20,9 @@
#include <linux/mm.h>
#include <linux/memblock.h>
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#include <asm-generic/pgalloc.h>
+
extern int mem_init_done;
#define pmd_populate_kernel(mm, pmd, pte) \
@@ -61,38 +64,8 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm)
}
#endif
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long)pgd);
-}
-
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
-static inline struct page *pte_alloc_one(struct mm_struct *mm)
-{
- struct page *pte;
- pte = alloc_pages(GFP_KERNEL, 0);
- if (!pte)
- return NULL;
- clear_page(page_address(pte));
- if (!pgtable_pte_page_ctor(pte)) {
- __free_page(pte);
- return NULL;
- }
- return pte;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *pte)
-{
- pgtable_pte_page_dtor(pte);
- __free_page(pte);
-}
-
#define __pte_free_tlb(tlb, pte, addr) \
do { \
pgtable_pte_page_dtor(pte); \
diff --git a/arch/openrisc/include/asm/tlbflush.h b/arch/openrisc/include/asm/tlbflush.h
index 4a4639c65cbb..185dcd3731ed 100644
--- a/arch/openrisc/include/asm/tlbflush.h
+++ b/arch/openrisc/include/asm/tlbflush.h
@@ -17,7 +17,6 @@
#include <linux/mm.h>
#include <asm/processor.h>
-#include <asm/pgalloc.h>
#include <asm/current.h>
#include <linux/sched.h>
diff --git a/arch/openrisc/kernel/or32_ksyms.c b/arch/openrisc/kernel/or32_ksyms.c
index 277ac7a55752..212e5f85004c 100644
--- a/arch/openrisc/kernel/or32_ksyms.c
+++ b/arch/openrisc/kernel/or32_ksyms.c
@@ -26,7 +26,6 @@
#include <asm/io.h>
#include <asm/hardirq.h>
#include <asm/delay.h>
-#include <asm/pgalloc.h>
#define DECLARE_EXPORT(name) extern void name(void); EXPORT_SYMBOL(name)
diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h
index 07b89c74abeb..cb5f2f730421 100644
--- a/arch/parisc/include/asm/mmu_context.h
+++ b/arch/parisc/include/asm/mmu_context.h
@@ -5,7 +5,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/atomic.h>
-#include <asm/pgalloc.h>
#include <asm-generic/mm_hooks.h>
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index 9ac74da256b8..cc7ecc2ef55d 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -10,7 +10,9 @@
#include <asm/cache.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PMD_FREE
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
/* Allocate the top level pgd (page directory)
*
@@ -65,14 +67,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
(__u32)(__pa((unsigned long)pmd) >> PxD_VALUE_SHIFT)));
}
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- pmd_t *pmd = (pmd_t *)__get_free_pages(GFP_KERNEL, PMD_ORDER);
- if (pmd)
- memset(pmd, 0, PAGE_SIZE<<PMD_ORDER);
- return pmd;
-}
-
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED) {
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 1eedfecc5137..b5e1d9f1b440 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -24,7 +24,6 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/sections.h>
#include <asm/shmparam.h>
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 4f1596bb1936..38c68e131bbe 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -32,7 +32,6 @@
#include <asm/dma.h> /* for DMA_CHUNK_SIZE */
#include <asm/io.h>
#include <asm/page.h> /* get_order */
-#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h> /* for purge_tlb_*() macros */
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index de6299ff1530..86ec30cc0e77 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -47,7 +47,6 @@
#include <asm/assembly.h>
#include <asm/pdc.h>
#include <asm/pdc_chassis.h>
-#include <asm/pgalloc.h>
#include <asm/unwind.h>
#include <asm/sections.h>
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 02895a8f2c55..5df5d4cd5d4c 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -30,7 +30,6 @@
#include <asm/ucontext.h>
#include <asm/rt_sigframe.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/asm-offsets.h>
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index f8a842ddd82d..6271139d2213 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -39,7 +39,6 @@
#include <asm/irq.h> /* for CPU_IRQ_REGION and friends */
#include <asm/mmu_context.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/ptrace.h>
#include <asm/unistd.h>
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index 0e1e212f1c96..d7ba014a7fbb 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -15,7 +15,6 @@
#include <linux/sysctl.h>
#include <asm/mman.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 39ea464c8bd9..4381b65ae1e0 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -689,11 +689,6 @@ void __init paging_init(void)
flush_cache_all_local(); /* start with known state */
flush_tlb_all_local(NULL);
- /*
- * Mark all memblocks as present for sparsemem using
- * memory_present() and then initialize sparsemem.
- */
- memblocks_present();
sparse_init();
parisc_bootmem_free();
}
diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c
index 6e7c005aa09b..345ff0b66499 100644
--- a/arch/parisc/mm/ioremap.c
+++ b/arch/parisc/mm/ioremap.c
@@ -11,7 +11,7 @@
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/io.h>
-#include <asm/pgalloc.h>
+#include <linux/mm.h>
/*
* Generic mapping function (not visible outside):
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index 862985cf5180..fbc6f3002f23 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -12,7 +12,6 @@
#ifndef __powerpc64__
#include <linux/pgtable.h>
#endif
-#include <asm/pgalloc.h>
#ifndef __powerpc64__
#include <asm/page.h>
#include <asm/mmu.h>
diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
index 25acb9c5ee1b..964467b3a776 100644
--- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
@@ -10,7 +10,6 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 2a99167afbaf..fd9c7f91b092 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -9,7 +9,6 @@
#include <linux/mm_types.h>
#include <linux/mm.h>
-#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index 0fbf3dc9f2c2..eb0bccaf221e 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -21,7 +21,6 @@
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/bug.h>
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index c812b401b66c..cb91071eef52 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -2,7 +2,6 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/security.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
#include <asm/mman.h>
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index bf1717f8d5f4..02c7db4087cb 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu.h>
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 4ae5fc0ceb30..02e127fa5777 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -225,12 +225,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
* fall back to system memory if the altmap allocation fails.
*/
if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
- p = altmap_alloc_block_buf(page_size, altmap);
+ p = vmemmap_alloc_block_buf(page_size, node, altmap);
if (!p)
pr_debug("altmap block allocation failed, falling back to system memory");
}
if (!p)
- p = vmemmap_alloc_block_buf(page_size, node);
+ p = vmemmap_alloc_block_buf(page_size, node, NULL);
if (!p)
return -ENOMEM;
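Note: with this hunk the altmap-versus-system-memory decision moves out of the architecture code and into vmemmap_alloc_block_buf() itself. The generic helper is not part of this diff; its post-series shape is presumably along these lines (sketch, not a verbatim copy of mm/sparse-vmemmap.c):

void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
					 struct vmem_altmap *altmap)
{
	void *ptr;

	if (altmap)
		return altmap_alloc_block_buf(size, altmap);	/* altmap path now internal */

	ptr = sparse_buffer_alloc(size);
	if (!ptr)
		ptr = vmemmap_alloc_block(size, node);
	return ptr;
}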
diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
index 569d98a41881..2784224054f8 100644
--- a/arch/powerpc/mm/kasan/8xx.c
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -5,7 +5,6 @@
#include <linux/kasan.h>
#include <linux/memblock.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
static int __init
kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block)
diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c
index a32b4640b9de..202bd260a009 100644
--- a/arch/powerpc/mm/kasan/book3s_32.c
+++ b/arch/powerpc/mm/kasan/book3s_32.c
@@ -4,7 +4,6 @@
#include <linux/kasan.h>
#include <linux/memblock.h>
-#include <asm/pgalloc.h>
#include <mm/mmu_decl.h>
int __init kasan_init_region(void *start, size_t size)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9dafc636588f..42e25874f5a8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -34,7 +34,6 @@
#include <linux/dma-direct.h>
#include <linux/kprobes.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -179,8 +178,6 @@ void __init mem_topology_setup(void)
void __init initmem_init(void)
{
- /* XXX need to clip this if using highmem? */
- sparse_memory_present_with_active_regions(0);
sparse_init();
}
diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c
index 13e74bc39ba5..95751c322f6c 100644
--- a/arch/powerpc/mm/nohash/40x.c
+++ b/arch/powerpc/mm/nohash/40x.c
@@ -32,7 +32,6 @@
#include <linux/highmem.h>
#include <linux/memblock.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 92e8929cbe3e..d2b37146ae6c 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -13,7 +13,6 @@
#include <asm/fixmap.h>
#include <asm/code-patching.h>
#include <asm/inst.h>
-#include <asm/pgalloc.h>
#include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c
index c06dfbb771f4..0c294827d6e5 100644
--- a/arch/powerpc/mm/nohash/fsl_booke.c
+++ b/arch/powerpc/mm/nohash/fsl_booke.c
@@ -37,7 +37,6 @@
#include <linux/highmem.h>
#include <linux/memblock.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index bce0e5349978..4c74e8a5482b 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -15,7 +15,6 @@
#include <linux/libfdt.h>
#include <linux/crash_core.h>
#include <asm/cacheflush.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/kdump.h>
#include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 696f568253a0..14514585db98 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -34,6 +34,7 @@
#include <linux/of_fdt.h>
#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/code-patching.h>
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 058fee9a0835..1f61fa2148b5 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -953,7 +953,6 @@ void __init initmem_init(void)
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
setup_node_data(nid, start_pfn, end_pfn);
- sparse_memory_present_with_active_regions(nid);
}
sparse_init();
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 1136257c3a99..9c0547d77af3 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -23,7 +23,6 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index bb43a8c04bee..cc6e2f94517f 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -31,7 +31,6 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/mmu_context.h>
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index 5b8bd34cd3a1..ad6df9a2e7c8 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -17,10 +17,10 @@
#include <linux/seq_file.h>
#include <linux/const.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/plpar_wrappers.h>
#include <linux/memblock.h>
#include <asm/firmware.h>
+#include <asm/pgalloc.h>
struct pg_state {
struct seq_file *seq;
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index c911cd757f7d..aca354fb670b 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -21,7 +21,6 @@
#include <asm/fixmap.h>
#include <linux/const.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/hugetlb.h>
#include <mm/mmu_decl.h>
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 9dba7e880885..45a3a3022a85 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -26,7 +26,6 @@
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/mmu.h>
-#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <linux/memory.h>
#include <asm/plpar_wrappers.h>
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 3f601ee8233f..23b1544e0ca5 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -11,7 +11,7 @@
#include <asm/tlb.h>
#ifdef CONFIG_MMU
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
@@ -55,24 +55,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
return pgd;
}
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long)pgd);
-}
-
#ifndef __PAGETABLE_PMD_FOLDED
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pmd_t *)__get_free_page(
- GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- free_page((unsigned long)pmd);
-}
-
#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd)
#endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index ae7b7fe24658..5873835a3e6b 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -14,7 +14,6 @@
#include <linux/signal.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/ptrace.h>
#include <asm/tlbflush.h>
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 50bcd9f594d7..787c75f751a5 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -570,7 +570,6 @@ static void __init resource_init(void)
void __init paging_init(void)
{
setup_vm_final();
- memblocks_present();
sparse_init();
setup_zero_page();
zone_sizes_init();
@@ -581,6 +580,6 @@ void __init paging_init(void)
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
- return vmemmap_populate_basepages(start, end, node);
+ return vmemmap_populate_basepages(start, end, node, NULL);
}
#endif
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index e1ae23911ccd..5057773f82e9 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -249,7 +249,7 @@ static void prng_tdes_deinstantiate(void)
{
pr_debug("The prng module stopped "
"after running in triple DES mode\n");
- kzfree(prng_data);
+ kfree_sensitive(prng_data);
}
@@ -442,7 +442,7 @@ outfree:
static void prng_sha512_deinstantiate(void)
{
pr_debug("The prng module stopped after running in SHA-512 mode\n");
- kzfree(prng_data);
+ kfree_sensitive(prng_data);
}
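Note: kzfree() is renamed to kfree_sensitive() here and in the other crypto hunks; the semantics (explicitly zero the object, then kfree() it) are unchanged. Minimal usage sketch, with a made-up key structure:

#include <linux/slab.h>
#include <linux/types.h>

struct tdes_keys {			/* hypothetical key material holder */
	u8 material[3 * 8];
};

static void drop_keys(struct tdes_keys *keys)
{
	/* kfree_sensitive() zeroes the allocation before freeing it. */
	kfree_sensitive(keys);
}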
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index aa406c05a350..954fa8ca6cbd 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -36,7 +36,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
#define p4d_free_tlb p4d_free_tlb
#define pud_free_tlb pud_free_tlb
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm-generic/tlb.h>
diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
index 2204704840ea..acce6a08a1fa 100644
--- a/arch/s390/include/asm/tlbflush.h
+++ b/arch/s390/include/asm/tlbflush.h
@@ -5,7 +5,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/processor.h>
-#include <asm/pgalloc.h>
/*
* Flush all TLB entries on the local CPU.
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 93c6b8932fbd..d91989c7bd6a 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -16,7 +16,6 @@
#include <linux/debug_locks.h>
#include <asm/cio.h>
#include <asm/setup.h>
-#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/ipl.h>
#include <asm/diag.h>
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index a09b9e98936c..11d2f7d05f91 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -25,7 +25,6 @@
#include <linux/compat.h>
#include <trace/syscall.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/switch_to.h>
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 563429dece03..5b8ec1c447e1 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -10,7 +10,6 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
-#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 96ae368aa0a2..2f721a923b54 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -22,7 +22,6 @@
#include <asm/ebcdic.h>
#include <asm/sysinfo.h>
#include <asm/page-states.h>
-#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/io.h>
#include <asm/ptrace.h>
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 63e330109b63..eb99e2f95ebe 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -9,7 +9,6 @@
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/sched/signal.h>
-#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/uv.h>
#include <asm/mman.h>
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 5c15ae3daf71..1141c8d5c0d0 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -21,7 +21,6 @@
#include <linux/oom.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/diag.h>
#ifdef CONFIG_CMM_IUCV
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6dc7c3b60ef6..0d282081dc1f 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -115,7 +115,6 @@ void __init paging_init(void)
__load_psw_mask(psw.mask);
kasan_free_early_identity();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_dma_bits = 31;
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 1b78f630a9ca..e54f928503c5 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -17,7 +17,6 @@
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
static unsigned long stack_maxrandom_size(void)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 2e0cc19f4cd7..0d25f743b270 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -19,7 +19,6 @@
#include <linux/ksm.h>
#include <linux/mman.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index d770da3f8b6f..0e6b0be25e33 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -3,6 +3,10 @@
#define __ASM_SH_PGALLOC_H
#include <asm/page.h>
+
+#define __HAVE_ARCH_PMD_ALLOC_ONE
+#define __HAVE_ARCH_PMD_FREE
+#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>
extern pgd_t *pgd_alloc(struct mm_struct *);
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index c20fc5487e05..0dc0f52f9bb8 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -14,7 +14,6 @@
#include <linux/irqflags.h>
#include <linux/smp.h>
#include <linux/atomic.h>
-#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/bl_bit.h>
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 4a98980b8a07..223c14f44af7 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -14,7 +14,6 @@
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/memblock.h>
-#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
#include <asm/cacheflush.h>
diff --git a/arch/sh/mm/cache-sh3.c b/arch/sh/mm/cache-sh3.c
index 26f3bd43e850..bc595982d396 100644
--- a/arch/sh/mm/cache-sh3.c
+++ b/arch/sh/mm/cache-sh3.c
@@ -16,7 +16,6 @@
#include <asm/cache.h>
#include <asm/io.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c
index 48978293226c..4c67b3d88775 100644
--- a/arch/sh/mm/cache-sh7705.c
+++ b/arch/sh/mm/cache-sh7705.c
@@ -20,7 +20,6 @@
#include <asm/cache.h>
#include <asm/io.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index acd5652a0de3..220d7bc43d2b 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -17,7 +17,6 @@
#include <linux/sysctl.h>
#include <asm/mman.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index a70ba0fdd0b3..613de8096335 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -27,6 +27,7 @@
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/cache.h>
+#include <asm/pgalloc.h>
#include <linux/sizes.h>
pgd_t swapper_pg_dir[PTRS_PER_PGD];
@@ -240,12 +241,6 @@ static void __init do_init_bootmem(void)
plat_mem_setup();
- for_each_memblock(memory, reg) {
- int nid = memblock_get_region_node(reg);
-
- memory_present(nid, memblock_region_memory_base_pfn(reg),
- memblock_region_memory_end_pfn(reg));
- }
sparse_init();
}
diff --git a/arch/sh/mm/ioremap_fixed.c b/arch/sh/mm/ioremap_fixed.c
index 07e744d75fa0..aab3f82856bb 100644
--- a/arch/sh/mm/ioremap_fixed.c
+++ b/arch/sh/mm/ioremap_fixed.c
@@ -18,7 +18,6 @@
#include <linux/proc_fs.h>
#include <asm/fixmap.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/addrspace.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index f7e4439deb17..50f0dc1744d0 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -53,7 +53,4 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
/* It's up */
node_set_online(nid);
-
- /* Kick sparsemem */
- sparse_memory_present_with_active_regions(nid);
}
diff --git a/arch/sh/mm/tlb-sh3.c b/arch/sh/mm/tlb-sh3.c
index 869243518bb3..fb400afc2a49 100644
--- a/arch/sh/mm/tlb-sh3.c
+++ b/arch/sh/mm/tlb-sh3.c
@@ -21,7 +21,6 @@
#include <asm/io.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
diff --git a/arch/sparc/include/asm/ide.h b/arch/sparc/include/asm/ide.h
index 499aa2e6e276..904cc6cbc155 100644
--- a/arch/sparc/include/asm/ide.h
+++ b/arch/sparc/include/asm/ide.h
@@ -13,7 +13,6 @@
#include <asm/io.h>
#ifdef CONFIG_SPARC64
-#include <asm/pgalloc.h>
#include <asm/spitfire.h>
#include <asm/cacheflush.h>
#include <asm/page.h>
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index 6820d357581c..e841cae544c2 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -4,7 +4,6 @@
#include <linux/swap.h>
#include <linux/pagemap.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c
index 41829c024f92..1eed26d423fb 100644
--- a/arch/sparc/kernel/leon_smp.c
+++ b/arch/sparc/kernel/leon_smp.c
@@ -38,7 +38,6 @@
#include <asm/delay.h>
#include <asm/irq.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/oplib.h>
#include <asm/cpudata.h>
#include <asm/asi.h>
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index bd123f1de2e7..3f519e1047b6 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -34,7 +34,6 @@
#include <asm/oplib.h>
#include <linux/uaccess.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/delay.h>
#include <asm/processor.h>
#include <asm/psr.h>
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 3b005b6c3e0f..f1f8c8ebe641 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -23,7 +23,6 @@
#include <linux/uaccess.h>
#include <asm/ptrace.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h> /* flush_sig_insns */
#include <asm/switch_to.h>
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 76ce290c67cf..50c127ab46d5 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -29,7 +29,6 @@
#include <asm/irq.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/oplib.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 0085e28bf019..e286e2badc8a 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -47,6 +47,7 @@
#include <linux/uaccess.h>
#include <asm/starfire.h>
#include <asm/tlb.h>
+#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/prom.h>
#include <asm/mdesc.h>
diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c
index 91b61f012d19..1079638986b5 100644
--- a/arch/sparc/kernel/sun4m_irq.c
+++ b/arch/sparc/kernel/sun4m_irq.c
@@ -16,7 +16,6 @@
#include <asm/timer.h>
#include <asm/traps.h>
-#include <asm/pgalloc.h>
#include <asm/irq.h>
#include <asm/io.h>
#include <asm/cacheflush.h>
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index d1fc9a7b7d78..8f2a2afb048a 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -29,7 +29,6 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
#include <asm/vaddrs.h>
static pte_t *kmap_pte;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 02e6e5e0f106..fad6d3129904 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1610,7 +1610,6 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
/* XXX cpu notifier XXX */
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
return end_pfn;
diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c
index bfcc04bfce54..430a47a1b6ae 100644
--- a/arch/sparc/mm/io-unit.c
+++ b/arch/sparc/mm/io-unit.c
@@ -15,7 +15,6 @@
#include <linux/of.h>
#include <linux/of_device.h>
-#include <asm/pgalloc.h>
#include <asm/io.h>
#include <asm/io-unit.h>
#include <asm/mxcc.h>
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index 35b002eb312e..3a388b1c5d4b 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -16,7 +16,6 @@
#include <linux/of.h>
#include <linux/of_device.h>
-#include <asm/pgalloc.h>
#include <asm/io.h>
#include <asm/mxcc.h>
#include <asm/mbus.h>
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index a32a16c18617..20ee14739333 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -10,7 +10,6 @@
#include <linux/swap.h>
#include <linux/preempt.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 881e76da1938..5393e13e07e0 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -10,7 +10,7 @@
#include <linux/mm.h>
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
#define pmd_populate_kernel(mm, pmd, pte) \
set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) __pa(pte)))
@@ -25,7 +25,6 @@
* Allocate and free page tables.
*/
extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
#define __pte_free_tlb(tlb,pte, address) \
do { \
@@ -34,12 +33,6 @@ do { \
} while (0)
#ifdef CONFIG_3_LEVEL_PGTABLES
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- free_page((unsigned long)pmd);
-}
-
#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x))
#endif
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 36f452957cef..7e6a4180db9d 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -78,9 +78,6 @@ static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; }
#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
#endif
-struct mm_struct;
-extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address);
-
static inline void pud_clear (pud_t *pud)
{
set_pud(pud, __pud(_PAGE_NEWPAGE));
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index c2ff76c8981e..9242dc91d751 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -196,23 +196,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
return pgd;
}
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long) pgd);
-}
-
-#ifdef CONFIG_3_LEVEL_PGTABLES
-pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL);
-
- if (pmd)
- memset(pmd, 0, PAGE_SIZE);
-
- return pmd;
-}
-#endif
-
void *uml_kmalloc(int size, int flags)
{
return kmalloc(size, flags);
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 385d3d172ee1..ca8a657edf59 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -30,7 +30,6 @@
#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/user32.h>
#include <asm/ia32.h>
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 47562147e70b..d98016b83755 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -9,7 +9,6 @@
#include <trace/events/tlb.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 29aa7859bdee..62ad61d6fefc 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -7,7 +7,8 @@
#include <linux/pagemap.h>
#define __HAVE_ARCH_PTE_ALLOC_ONE
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
@@ -86,30 +87,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
#define pmd_pgtable(pmd) pmd_page(pmd)
#if CONFIG_PGTABLE_LEVELS > 2
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- struct page *page;
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
-
- if (mm == &init_mm)
- gfp &= ~__GFP_ACCOUNT;
- page = alloc_pages(gfp, 0);
- if (!page)
- return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
- return NULL;
- }
- return (pmd_t *)page_address(page);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- pgtable_pmd_page_dtor(virt_to_page(pmd));
- free_page((unsigned long)pmd);
-}
-
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
@@ -147,21 +124,6 @@ static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pu
set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT;
-
- if (mm == &init_mm)
- gfp &= ~__GFP_ACCOUNT;
- return (pud_t *)get_zeroed_page(gfp);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- free_page((unsigned long)pud);
-}
-
extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
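Note: the pmd and pud alloc/free bodies deleted from the x86 header are essentially what <asm-generic/pgalloc.h> now provides for every architecture, with the GFP_KERNEL_ACCOUNT handling expressed through GFP_PGTABLE_USER/GFP_PGTABLE_KERNEL. A sketch of the generic pmd pair (not a verbatim copy of the header):

static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
	struct page *page;
	gfp_t gfp = GFP_PGTABLE_USER;

	if (mm == &init_mm)
		gfp = GFP_PGTABLE_KERNEL;	/* kernel tables are not memcg-accounted */
	page = alloc_pages(gfp, 0);
	if (!page)
		return NULL;
	if (!pgtable_pmd_page_ctor(page)) {
		__free_pages(page, 0);
		return NULL;
	}
	return (pmd_t *)page_address(page);
}

static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
	BUG_ON((unsigned long)pmd & (PAGE_SIZE - 1));
	pgtable_pmd_page_dtor(virt_to_page(pmd));
	free_page((unsigned long)pmd);
}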
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c826cddae157..d1175533d125 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
+#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e0e2f020ec02..ccf726cc87b7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -40,7 +40,6 @@
#include <asm/irq_remapping.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
-#include <asm/pgalloc.h>
#include <linux/atomic.h>
#include <asm/mpspec.h>
#include <asm/i8259.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index afac7ccce72f..c27b82b62c8b 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -22,7 +22,6 @@
#include <asm/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
#include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 438fc554d48d..1f66d2d1e998 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,7 +62,6 @@
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
-#include <asm/pgalloc.h>
#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5e5edd2ec893..0c7643d9f7cb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -21,7 +21,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
-#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index cf5781142716..a0d023cb4292 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -17,7 +17,6 @@
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
#if 0 /* This is just for testing */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8b4afad84f4a..4cb958419fb0 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -678,7 +678,6 @@ void __init initmem_init(void)
#endif
memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
- sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
@@ -718,7 +717,6 @@ void __init paging_init(void)
* NOTE: at this point the bootmem allocator is fully available.
*/
olpc_dt_build_devicetree();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_sizes_init();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 449e071240e1..3b246ae40c8f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -817,7 +817,6 @@ void __init initmem_init(void)
void __init paging_init(void)
{
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
/*
@@ -1510,10 +1509,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (pmd_none(*pmd)) {
void *p;
- if (altmap)
- p = altmap_alloc_block_buf(PMD_SIZE, altmap);
- else
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (p) {
pte_t entry;
@@ -1540,7 +1536,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
}
- if (vmemmap_populate_basepages(addr, next, node))
+ if (vmemmap_populate_basepages(addr, next, node, NULL))
return -ENOMEM;
}
return 0;
@@ -1552,7 +1548,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
int err;
if (end - start < PAGES_PER_SECTION * sizeof(struct page))
- err = vmemmap_populate_basepages(start, end, node);
+ err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
err = vmemmap_populate_hugepages(start, end, node, altmap);
else if (altmap) {
@@ -1560,7 +1556,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
__func__);
err = -ENOMEM;
} else
- err = vmemmap_populate_basepages(start, end, node);
+ err = vmemmap_populate_basepages(start, end, node, NULL);
if (!err)
sync_global_pgds(start, end - 1);
return err;
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index fb620fd9dae9..6e6b39710e5f 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -26,7 +26,6 @@
#include <linux/memblock.h>
#include <linux/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/kaslr.h>
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 1953685c2ddf..c234634e26ba 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -11,7 +11,6 @@
#include <linux/spinlock.h>
#include <asm/cpu_entry_area.h>
-#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
#include <asm/tlb.h>
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a8a924b3c335..1aab92930569 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -34,7 +34,6 @@
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index a6e5f2c1805d..a2f447dffea6 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <asm/efi.h>
#include <linux/io.h>
+#include <asm/pgalloc.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv_hub.h>
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index d147f1b2c925..cd3914fc9f3d 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -98,7 +98,7 @@ static int get_e820_md5(struct e820_table *table, void *buf)
if (crypto_shash_digest(desc, (u8 *)table, size, buf))
ret = -EINVAL;
- kzfree(desc);
+ kfree_sensitive(desc);
free_tfm:
crypto_free_shash(tfm);
diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h
index 1d38f0e755ba..d3a22da4d2c9 100644
--- a/arch/xtensa/include/asm/pgalloc.h
+++ b/arch/xtensa/include/asm/pgalloc.h
@@ -8,9 +8,14 @@
#ifndef _XTENSA_PGALLOC_H
#define _XTENSA_PGALLOC_H
+#ifdef CONFIG_MMU
#include <linux/highmem.h>
#include <linux/slab.h>
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#define __HAVE_ARCH_PTE_ALLOC_ONE
+#include <asm-generic/pgalloc.h>
+
/*
* Allocating and freeing a pmd is trivial: the 1-entry pmd is
* inside the pgd, so has no extra memory associated with it.
@@ -28,50 +33,37 @@ pgd_alloc(struct mm_struct *mm)
return (pgd_t*) __get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
}
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+static inline void ptes_clear(pte_t *ptep)
{
- free_page((unsigned long)pgd);
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ pte_clear(NULL, 0, ptep + i);
}
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
pte_t *ptep;
- int i;
- ptep = (pte_t *)__get_free_page(GFP_KERNEL);
+ ptep = (pte_t *)__pte_alloc_one_kernel(mm);
if (!ptep)
return NULL;
- for (i = 0; i < 1024; i++)
- pte_clear(NULL, 0, ptep + i);
+ ptes_clear(ptep);
return ptep;
}
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
- pte_t *pte;
struct page *page;
- pte = pte_alloc_one_kernel(mm);
- if (!pte)
+ page = __pte_alloc_one(mm, GFP_PGTABLE_USER);
+ if (!page)
return NULL;
- page = virt_to_page(pte);
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
- return NULL;
- }
+ ptes_clear(page_address(page));
return page;
}
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- pgtable_pte_page_dtor(pte);
- __free_page(pte);
-}
#define pmd_pgtable(pmd) pmd_page(pmd)
+#endif /* CONFIG_MMU */
#endif /* _XTENSA_PGALLOC_H */
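
With __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL and __HAVE_ARCH_PTE_ALLOC_ONE defined, xtensa keeps thin wrappers but delegates the actual allocation to asm-generic/pgalloc.h. A sketch of what those generic helpers are assumed to look like (GFP_PGTABLE_KERNEL and GFP_PGTABLE_USER include __GFP_ZERO; see include/asm-generic/pgalloc.h for the real definitions):

        /* assumed shape of the generic page-table allocators */
        static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
        {
                return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
        }

        static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
        {
                struct page *pte = alloc_page(gfp);

                if (!pte)
                        return NULL;
                if (!pgtable_pte_page_ctor(pte)) {      /* page-table accounting/ptlock setup */
                        __free_page(pte);
                        return NULL;
                }
                return pte;
        }

The explicit ptes_clear() on top of an already zeroed page is kept, presumably because an empty PTE on xtensa is not the all-zeroes pattern, so pte_clear() still has to rewrite each entry.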
diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c
index 24cf6972eace..415fe7faa37f 100644
--- a/arch/xtensa/kernel/xtensa_ksyms.c
+++ b/arch/xtensa/kernel/xtensa_ksyms.c
@@ -25,7 +25,6 @@
#include <asm/dma.h>
#include <asm/io.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/ftrace.h>
#ifdef CONFIG_BLK_DEV_FD
#include <asm/floppy.h>
diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c
index 2369433b734a..5835406b3cec 100644
--- a/arch/xtensa/mm/cache.c
+++ b/arch/xtensa/mm/cache.c
@@ -31,7 +31,6 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
/*
* Note:
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index c4decc73fd86..c128dcc7c85b 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -20,7 +20,6 @@
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/hardirq.h>
-#include <asm/pgalloc.h>
DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
void bad_page_fault(struct pt_regs*, unsigned long, int);
diff --git a/crypto/adiantum.c b/crypto/adiantum.c
index 7fbdc3270984..ce4d5725342c 100644
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -177,7 +177,7 @@ static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key,
keyp += NHPOLY1305_KEY_SIZE;
WARN_ON(keyp != &data->derived_keys[ARRAY_SIZE(data->derived_keys)]);
out:
- kzfree(data);
+ kfree_sensitive(data);
return err;
}
diff --git a/crypto/ahash.c b/crypto/ahash.c
index 68a0f0cb75c4..d9d65d1cc669 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -183,7 +183,7 @@ static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key,
alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
memcpy(alignbuffer, key, keylen);
ret = tfm->setkey(tfm, alignbuffer, keylen);
- kzfree(buffer);
+ kfree_sensitive(buffer);
return ret;
}
@@ -302,7 +302,7 @@ static void ahash_restore_req(struct ahash_request *req, int err)
req->priv = NULL;
/* Free the req->priv.priv from the ADJUSTED request. */
- kzfree(priv);
+ kfree_sensitive(priv);
}
static void ahash_notify_einprogress(struct ahash_request *req)
diff --git a/crypto/api.c b/crypto/api.c
index 5d8fe60b36c1..ed08cbd5b9d3 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -571,7 +571,7 @@ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
alg->cra_exit(tfm);
crypto_exit_ops(tfm);
crypto_mod_put(alg);
- kzfree(mem);
+ kfree_sensitive(mem);
}
EXPORT_SYMBOL_GPL(crypto_destroy_tfm);
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index cc9dbcecaaca..7553ab18db89 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -376,7 +376,7 @@ static int pefile_digest_pe(const void *pebuf, unsigned int pelen,
}
error:
- kzfree(desc);
+ kfree_sensitive(desc);
error_no_desc:
crypto_free_shash(tfm);
kleave(" = %d", ret);
@@ -447,6 +447,6 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
ret = pefile_digest_pe(pebuf, pelen, &ctx);
error:
- kzfree(ctx.digest);
+ kfree_sensitive(ctx.digest);
return ret;
}
diff --git a/crypto/deflate.c b/crypto/deflate.c
index 4c0e6c9d942a..b2a46f6dc961 100644
--- a/crypto/deflate.c
+++ b/crypto/deflate.c
@@ -163,7 +163,7 @@ static void __deflate_exit(void *ctx)
static void deflate_free_ctx(struct crypto_scomp *tfm, void *ctx)
{
__deflate_exit(ctx);
- kzfree(ctx);
+ kfree_sensitive(ctx);
}
static void deflate_exit(struct crypto_tfm *tfm)
diff --git a/crypto/drbg.c b/crypto/drbg.c
index 8d80d93cab97..e99fe34cfa00 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -1218,19 +1218,19 @@ static inline void drbg_dealloc_state(struct drbg_state *drbg)
{
if (!drbg)
return;
- kzfree(drbg->Vbuf);
+ kfree_sensitive(drbg->Vbuf);
drbg->Vbuf = NULL;
drbg->V = NULL;
- kzfree(drbg->Cbuf);
+ kfree_sensitive(drbg->Cbuf);
drbg->Cbuf = NULL;
drbg->C = NULL;
- kzfree(drbg->scratchpadbuf);
+ kfree_sensitive(drbg->scratchpadbuf);
drbg->scratchpadbuf = NULL;
drbg->reseed_ctr = 0;
drbg->d_ops = NULL;
drbg->core = NULL;
if (IS_ENABLED(CONFIG_CRYPTO_FIPS)) {
- kzfree(drbg->prev);
+ kfree_sensitive(drbg->prev);
drbg->prev = NULL;
drbg->fips_primed = false;
}
@@ -1701,7 +1701,7 @@ static int drbg_fini_hash_kernel(struct drbg_state *drbg)
struct sdesc *sdesc = (struct sdesc *)drbg->priv_data;
if (sdesc) {
crypto_free_shash(sdesc->shash.tfm);
- kzfree(sdesc);
+ kfree_sensitive(sdesc);
}
drbg->priv_data = NULL;
return 0;
diff --git a/crypto/ecc.c b/crypto/ecc.c
index 8acf8433ca29..c80aa25994a0 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -67,7 +67,7 @@ static u64 *ecc_alloc_digits_space(unsigned int ndigits)
static void ecc_free_digits_space(u64 *space)
{
- kzfree(space);
+ kfree_sensitive(space);
}
static struct ecc_point *ecc_alloc_point(unsigned int ndigits)
@@ -101,9 +101,9 @@ static void ecc_free_point(struct ecc_point *p)
if (!p)
return;
- kzfree(p->x);
- kzfree(p->y);
- kzfree(p);
+ kfree_sensitive(p->x);
+ kfree_sensitive(p->y);
+ kfree_sensitive(p);
}
static void vli_clear(u64 *vli, unsigned int ndigits)
diff --git a/crypto/ecdh.c b/crypto/ecdh.c
index bd599053a8c4..b0232d6ab4ce 100644
--- a/crypto/ecdh.c
+++ b/crypto/ecdh.c
@@ -124,7 +124,7 @@ static int ecdh_compute_value(struct kpp_request *req)
/* fall through */
free_all:
- kzfree(shared_secret);
+ kfree_sensitive(shared_secret);
free_pubkey:
kfree(public_key);
return ret;
diff --git a/crypto/gcm.c b/crypto/gcm.c
index 3a36a9533c96..338ee0769747 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -139,7 +139,7 @@ static int crypto_gcm_setkey(struct crypto_aead *aead, const u8 *key,
CRYPTO_TFM_REQ_MASK);
err = crypto_ahash_setkey(ghash, (u8 *)&data->hash, sizeof(be128));
out:
- kzfree(data);
+ kfree_sensitive(data);
return err;
}
diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index a4b1c026aaee..a69ae3e6c16c 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -304,8 +304,8 @@ void gf128mul_free_64k(struct gf128mul_64k *t)
int i;
for (i = 0; i < 16; i++)
- kzfree(t->t[i]);
- kzfree(t);
+ kfree_sensitive(t->t[i]);
+ kfree_sensitive(t);
}
EXPORT_SYMBOL(gf128mul_free_64k);
diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c
index b43684c0dade..eb7d1dd506bf 100644
--- a/crypto/jitterentropy-kcapi.c
+++ b/crypto/jitterentropy-kcapi.c
@@ -57,7 +57,7 @@ void *jent_zalloc(unsigned int len)
void jent_zfree(void *ptr)
{
- kzfree(ptr);
+ kfree_sensitive(ptr);
}
int jent_fips_enabled(void)
diff --git a/crypto/rng.c b/crypto/rng.c
index 1490d210f1a1..a888d84b524a 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -53,7 +53,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
crypto_stats_rng_seed(alg, err);
out:
- kzfree(buf);
+ kfree_sensitive(buf);
return err;
}
EXPORT_SYMBOL_GPL(crypto_rng_reset);
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 4983b2b4a223..ddd3d10ffc15 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -199,7 +199,7 @@ static int pkcs1pad_encrypt_sign_complete(struct akcipher_request *req, int err)
sg_copy_from_buffer(req->dst,
sg_nents_for_len(req->dst, ctx->key_size),
out_buf, ctx->key_size);
- kzfree(out_buf);
+ kfree_sensitive(out_buf);
out:
req->dst_len = ctx->key_size;
@@ -322,7 +322,7 @@ static int pkcs1pad_decrypt_complete(struct akcipher_request *req, int err)
out_buf + pos, req->dst_len);
done:
- kzfree(req_ctx->out_buf);
+ kfree_sensitive(req_ctx->out_buf);
return err;
}
@@ -500,7 +500,7 @@ static int pkcs1pad_verify_complete(struct akcipher_request *req, int err)
req->dst_len) != 0)
err = -EKEYREJECTED;
done:
- kzfree(req_ctx->out_buf);
+ kfree_sensitive(req_ctx->out_buf);
return err;
}
diff --git a/crypto/seqiv.c b/crypto/seqiv.c
index 23e22d8b63e6..0899d527c284 100644
--- a/crypto/seqiv.c
+++ b/crypto/seqiv.c
@@ -33,7 +33,7 @@ static void seqiv_aead_encrypt_complete2(struct aead_request *req, int err)
memcpy(req->iv, subreq->iv, crypto_aead_ivsize(geniv));
out:
- kzfree(subreq->iv);
+ kfree_sensitive(subreq->iv);
}
static void seqiv_aead_encrypt_complete(struct crypto_async_request *base,
diff --git a/crypto/shash.c b/crypto/shash.c
index e6a4b5f39b8c..2e3433ad9762 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -44,7 +44,7 @@ static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
memcpy(alignbuffer, key, keylen);
err = shash->setkey(tfm, alignbuffer, keylen);
- kzfree(buffer);
+ kfree_sensitive(buffer);
return err;
}
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 467af525848a..b4dae640de9f 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -592,7 +592,7 @@ static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm,
alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
memcpy(alignbuffer, key, keylen);
ret = cipher->setkey(tfm, alignbuffer, keylen);
- kzfree(buffer);
+ kfree_sensitive(buffer);
return ret;
}
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 6863f911fcee..23c27fc96394 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1744,7 +1744,7 @@ out:
kfree(vec.plaintext);
kfree(vec.digest);
crypto_free_shash(generic_tfm);
- kzfree(generic_desc);
+ kfree_sensitive(generic_desc);
return err;
}
#else /* !CONFIG_CRYPTO_MANAGER_EXTRA_TESTS */
@@ -3665,7 +3665,7 @@ static int drbg_cavs_test(const struct drbg_testvec *test, int pr,
if (IS_ERR(drng)) {
printk(KERN_ERR "alg: drbg: could not allocate DRNG handle for "
"%s\n", driver);
- kzfree(buf);
+ kfree_sensitive(buf);
return -ENOMEM;
}
@@ -3712,7 +3712,7 @@ static int drbg_cavs_test(const struct drbg_testvec *test, int pr,
outbuf:
crypto_free_rng(drng);
- kzfree(buf);
+ kfree_sensitive(buf);
return ret;
}
diff --git a/crypto/zstd.c b/crypto/zstd.c
index 5a3ff258d8f7..1a3309f066f7 100644
--- a/crypto/zstd.c
+++ b/crypto/zstd.c
@@ -137,7 +137,7 @@ static void __zstd_exit(void *ctx)
static void zstd_free_ctx(struct crypto_scomp *tfm, void *ctx)
{
__zstd_exit(ctx);
- kzfree(ctx);
+ kfree_sensitive(ctx);
}
static void zstd_exit(struct crypto_tfm *tfm)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5b02f69769e8..508b80f6329b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -368,8 +368,8 @@ static ssize_t node_read_meminfo(struct device *dev,
unsigned long sreclaimable, sunreclaimable;
si_meminfo_node(&i, nid);
- sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
- sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE);
+ sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
+ sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
n = sprintf(buf,
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
@@ -440,9 +440,9 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
nid, K(i.sharedram),
- nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
+ nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
- nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB),
+ nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
nid, 0UL,
@@ -513,7 +513,7 @@ static ssize_t node_read_vmstat(struct device *dev,
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n", node_stat_name(i),
- node_page_state(pgdat, i));
+ node_page_state_pages(pgdat, i));
return n;
}
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index a3eeccf3ac5f..c6ea5d38c509 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -36,7 +36,6 @@
#include <linux/io.h>
#include <linux/rbtree.h>
#include <asm/setup.h>
-#include <asm/pgalloc.h>
#include <asm/hypervisor.h>
#include <xen/grant_table.h>
#include <xen/page.h>
diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
index 1e4f9a58bb24..b4d5fea27d20 100644
--- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
+++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
@@ -254,7 +254,7 @@ theend_iv:
offset = areq->cryptlen - ivsize;
if (rctx->op_dir & CE_DECRYPTION) {
memcpy(areq->iv, backup_iv, ivsize);
- kzfree(backup_iv);
+ kfree_sensitive(backup_iv);
} else {
scatterwalk_map_and_copy(areq->iv, areq->dst, offset,
ivsize, 0);
diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c
index 7a131675a41c..7b39b4495571 100644
--- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c
+++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c
@@ -249,7 +249,7 @@ theend_iv:
if (rctx->op_dir & SS_DECRYPTION) {
memcpy(areq->iv, backup_iv, ivsize);
memzero_explicit(backup_iv, ivsize);
- kzfree(backup_iv);
+ kfree_sensitive(backup_iv);
} else {
scatterwalk_map_and_copy(areq->iv, areq->dst, offset,
ivsize, 0);
diff --git a/drivers/crypto/amlogic/amlogic-gxl-cipher.c b/drivers/crypto/amlogic/amlogic-gxl-cipher.c
index 5880b94dcb32..d93210726697 100644
--- a/drivers/crypto/amlogic/amlogic-gxl-cipher.c
+++ b/drivers/crypto/amlogic/amlogic-gxl-cipher.c
@@ -252,8 +252,8 @@ static int meson_cipher(struct skcipher_request *areq)
}
}
theend:
- kzfree(bkeyiv);
- kzfree(backup_iv);
+ kfree_sensitive(bkeyiv);
+ kfree_sensitive(backup_iv);
return err;
}
diff --git a/drivers/crypto/atmel-ecc.c b/drivers/crypto/atmel-ecc.c
index ff02cc05affb..9bd8e5167be3 100644
--- a/drivers/crypto/atmel-ecc.c
+++ b/drivers/crypto/atmel-ecc.c
@@ -69,7 +69,7 @@ static void atmel_ecdh_done(struct atmel_i2c_work_data *work_data, void *areq,
/* fall through */
free_work_data:
- kzfree(work_data);
+ kfree_sensitive(work_data);
kpp_request_complete(req, status);
}
diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c
index 2e44d685618f..dd5f101e43f8 100644
--- a/drivers/crypto/caam/caampkc.c
+++ b/drivers/crypto/caam/caampkc.c
@@ -854,14 +854,14 @@ static int caam_rsa_dec(struct akcipher_request *req)
static void caam_rsa_free_key(struct caam_rsa_key *key)
{
- kzfree(key->d);
- kzfree(key->p);
- kzfree(key->q);
- kzfree(key->dp);
- kzfree(key->dq);
- kzfree(key->qinv);
- kzfree(key->tmp1);
- kzfree(key->tmp2);
+ kfree_sensitive(key->d);
+ kfree_sensitive(key->p);
+ kfree_sensitive(key->q);
+ kfree_sensitive(key->dp);
+ kfree_sensitive(key->dq);
+ kfree_sensitive(key->qinv);
+ kfree_sensitive(key->tmp1);
+ kfree_sensitive(key->tmp2);
kfree(key->e);
kfree(key->n);
memset(key, 0, sizeof(*key));
@@ -1018,17 +1018,17 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx,
return;
free_dq:
- kzfree(rsa_key->dq);
+ kfree_sensitive(rsa_key->dq);
free_dp:
- kzfree(rsa_key->dp);
+ kfree_sensitive(rsa_key->dp);
free_tmp2:
- kzfree(rsa_key->tmp2);
+ kfree_sensitive(rsa_key->tmp2);
free_tmp1:
- kzfree(rsa_key->tmp1);
+ kfree_sensitive(rsa_key->tmp1);
free_q:
- kzfree(rsa_key->q);
+ kfree_sensitive(rsa_key->q);
free_p:
- kzfree(rsa_key->p);
+ kfree_sensitive(rsa_key->p);
}
static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key,
diff --git a/drivers/crypto/cavium/cpt/cptvf_main.c b/drivers/crypto/cavium/cpt/cptvf_main.c
index 0f72e9abdefe..a15245992cf9 100644
--- a/drivers/crypto/cavium/cpt/cptvf_main.c
+++ b/drivers/crypto/cavium/cpt/cptvf_main.c
@@ -74,7 +74,7 @@ static void cleanup_worker_threads(struct cpt_vf *cptvf)
for (i = 0; i < cptvf->nr_queues; i++)
tasklet_kill(&cwqe_info->vq_wqe[i].twork);
- kzfree(cwqe_info);
+ kfree_sensitive(cwqe_info);
cptvf->wqe_info = NULL;
}
@@ -88,7 +88,7 @@ static void free_pending_queues(struct pending_qinfo *pqinfo)
continue;
/* free single queue */
- kzfree((queue->head));
+ kfree_sensitive((queue->head));
queue->front = 0;
queue->rear = 0;
@@ -189,7 +189,7 @@ static void free_command_queues(struct cpt_vf *cptvf,
chunk->head = NULL;
chunk->dma_addr = 0;
hlist_del(&chunk->nextchunk);
- kzfree(chunk);
+ kfree_sensitive(chunk);
}
queue->nchunks = 0;
diff --git a/drivers/crypto/cavium/cpt/cptvf_reqmanager.c b/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
index 3878b01e19e1..dc5fda522719 100644
--- a/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
+++ b/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
@@ -305,12 +305,12 @@ static void do_request_cleanup(struct cpt_vf *cptvf,
}
}
- kzfree(info->scatter_components);
- kzfree(info->gather_components);
- kzfree(info->out_buffer);
- kzfree(info->in_buffer);
- kzfree((void *)info->completion_addr);
- kzfree(info);
+ kfree_sensitive(info->scatter_components);
+ kfree_sensitive(info->gather_components);
+ kfree_sensitive(info->out_buffer);
+ kfree_sensitive(info->in_buffer);
+ kfree_sensitive((void *)info->completion_addr);
+ kfree_sensitive(info);
}
static void do_post_process(struct cpt_vf *cptvf, struct cpt_info_buffer *info)
diff --git a/drivers/crypto/cavium/nitrox/nitrox_lib.c b/drivers/crypto/cavium/nitrox/nitrox_lib.c
index 5cbc64b851b9..a5cdc2b48bd6 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_lib.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_lib.c
@@ -90,7 +90,7 @@ static void nitrox_free_aqm_queues(struct nitrox_device *ndev)
for (i = 0; i < ndev->nr_queues; i++) {
nitrox_cmdq_cleanup(ndev->aqmq[i]);
- kzfree(ndev->aqmq[i]);
+ kfree_sensitive(ndev->aqmq[i]);
ndev->aqmq[i] = NULL;
}
}
@@ -122,7 +122,7 @@ static int nitrox_alloc_aqm_queues(struct nitrox_device *ndev)
err = nitrox_cmdq_init(cmdq, AQM_Q_ALIGN_BYTES);
if (err) {
- kzfree(cmdq);
+ kfree_sensitive(cmdq);
goto aqmq_fail;
}
ndev->aqmq[i] = cmdq;
diff --git a/drivers/crypto/cavium/zip/zip_crypto.c b/drivers/crypto/cavium/zip/zip_crypto.c
index 4985bc812b0e..7df71fcebe8f 100644
--- a/drivers/crypto/cavium/zip/zip_crypto.c
+++ b/drivers/crypto/cavium/zip/zip_crypto.c
@@ -260,7 +260,7 @@ void *zip_alloc_scomp_ctx_deflate(struct crypto_scomp *tfm)
ret = zip_ctx_init(zip_ctx, 0);
if (ret) {
- kzfree(zip_ctx);
+ kfree_sensitive(zip_ctx);
return ERR_PTR(ret);
}
@@ -279,7 +279,7 @@ void *zip_alloc_scomp_ctx_lzs(struct crypto_scomp *tfm)
ret = zip_ctx_init(zip_ctx, 1);
if (ret) {
- kzfree(zip_ctx);
+ kfree_sensitive(zip_ctx);
return ERR_PTR(ret);
}
@@ -291,7 +291,7 @@ void zip_free_scomp_ctx(struct crypto_scomp *tfm, void *ctx)
struct zip_kernel_ctx *zip_ctx = ctx;
zip_ctx_exit(zip_ctx);
- kzfree(zip_ctx);
+ kfree_sensitive(zip_ctx);
}
int zip_scomp_compress(struct crypto_scomp *tfm,
diff --git a/drivers/crypto/ccp/ccp-crypto-rsa.c b/drivers/crypto/ccp/ccp-crypto-rsa.c
index 649c91d60401..1223ac70aea2 100644
--- a/drivers/crypto/ccp/ccp-crypto-rsa.c
+++ b/drivers/crypto/ccp/ccp-crypto-rsa.c
@@ -112,13 +112,13 @@ static int ccp_check_key_length(unsigned int len)
static void ccp_rsa_free_key_bufs(struct ccp_ctx *ctx)
{
/* Clean up old key data */
- kzfree(ctx->u.rsa.e_buf);
+ kfree_sensitive(ctx->u.rsa.e_buf);
ctx->u.rsa.e_buf = NULL;
ctx->u.rsa.e_len = 0;
- kzfree(ctx->u.rsa.n_buf);
+ kfree_sensitive(ctx->u.rsa.n_buf);
ctx->u.rsa.n_buf = NULL;
ctx->u.rsa.n_len = 0;
- kzfree(ctx->u.rsa.d_buf);
+ kfree_sensitive(ctx->u.rsa.d_buf);
ctx->u.rsa.d_buf = NULL;
ctx->u.rsa.d_len = 0;
}
diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c
index 1cf51edbc4b9..35794c7271fb 100644
--- a/drivers/crypto/ccree/cc_aead.c
+++ b/drivers/crypto/ccree/cc_aead.c
@@ -448,7 +448,7 @@ static int cc_get_plain_hmac_key(struct crypto_aead *tfm, const u8 *authkey,
if (dma_mapping_error(dev, key_dma_addr)) {
dev_err(dev, "Mapping key va=0x%p len=%u for DMA failed\n",
key, keylen);
- kzfree(key);
+ kfree_sensitive(key);
return -ENOMEM;
}
if (keylen > blocksize) {
@@ -533,7 +533,7 @@ static int cc_get_plain_hmac_key(struct crypto_aead *tfm, const u8 *authkey,
if (key_dma_addr)
dma_unmap_single(dev, key_dma_addr, keylen, DMA_TO_DEVICE);
- kzfree(key);
+ kfree_sensitive(key);
return rc;
}
diff --git a/drivers/crypto/ccree/cc_buffer_mgr.c b/drivers/crypto/ccree/cc_buffer_mgr.c
index b2bd093e7013..a5e041d9d2cf 100644
--- a/drivers/crypto/ccree/cc_buffer_mgr.c
+++ b/drivers/crypto/ccree/cc_buffer_mgr.c
@@ -488,7 +488,7 @@ void cc_unmap_aead_request(struct device *dev, struct aead_request *req)
if (areq_ctx->gen_ctx.iv_dma_addr) {
dma_unmap_single(dev, areq_ctx->gen_ctx.iv_dma_addr,
hw_iv_size, DMA_BIDIRECTIONAL);
- kzfree(areq_ctx->gen_ctx.iv);
+ kfree_sensitive(areq_ctx->gen_ctx.iv);
}
/* Release pool */
@@ -559,7 +559,7 @@ static int cc_aead_chain_iv(struct cc_drvdata *drvdata,
if (dma_mapping_error(dev, areq_ctx->gen_ctx.iv_dma_addr)) {
dev_err(dev, "Mapping iv %u B at va=%pK for DMA failed\n",
hw_iv_size, req->iv);
- kzfree(areq_ctx->gen_ctx.iv);
+ kfree_sensitive(areq_ctx->gen_ctx.iv);
areq_ctx->gen_ctx.iv = NULL;
rc = -ENOMEM;
goto chain_iv_exit;
diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c
index 076669dc1035..d77ae981b64b 100644
--- a/drivers/crypto/ccree/cc_cipher.c
+++ b/drivers/crypto/ccree/cc_cipher.c
@@ -257,7 +257,7 @@ static void cc_cipher_exit(struct crypto_tfm *tfm)
&ctx_p->user.key_dma_addr);
/* Free key buffer in context */
- kzfree(ctx_p->user.key);
+ kfree_sensitive(ctx_p->user.key);
dev_dbg(dev, "Free key buffer in context. key=@%p\n", ctx_p->user.key);
}
@@ -881,7 +881,7 @@ static void cc_cipher_complete(struct device *dev, void *cc_req, int err)
/* Not a BACKLOG notification */
cc_unmap_cipher_request(dev, req_ctx, ivsize, src, dst);
memcpy(req->iv, req_ctx->iv, ivsize);
- kzfree(req_ctx->iv);
+ kfree_sensitive(req_ctx->iv);
}
skcipher_request_complete(req, err);
@@ -994,7 +994,7 @@ static int cc_cipher_process(struct skcipher_request *req,
exit_process:
if (rc != -EINPROGRESS && rc != -EBUSY) {
- kzfree(req_ctx->iv);
+ kfree_sensitive(req_ctx->iv);
}
return rc;
diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index d5310783af15..683c9a430e11 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -764,7 +764,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key,
if (dma_mapping_error(dev, ctx->key_params.key_dma_addr)) {
dev_err(dev, "Mapping key va=0x%p len=%u for DMA failed\n",
ctx->key_params.key, keylen);
- kzfree(ctx->key_params.key);
+ kfree_sensitive(ctx->key_params.key);
return -ENOMEM;
}
dev_dbg(dev, "mapping key-buffer: key_dma_addr=%pad keylen=%u\n",
@@ -913,7 +913,7 @@ out:
&ctx->key_params.key_dma_addr, ctx->key_params.keylen);
}
- kzfree(ctx->key_params.key);
+ kfree_sensitive(ctx->key_params.key);
return rc;
}
@@ -950,7 +950,7 @@ static int cc_xcbc_setkey(struct crypto_ahash *ahash,
if (dma_mapping_error(dev, ctx->key_params.key_dma_addr)) {
dev_err(dev, "Mapping key va=0x%p len=%u for DMA failed\n",
key, keylen);
- kzfree(ctx->key_params.key);
+ kfree_sensitive(ctx->key_params.key);
return -ENOMEM;
}
dev_dbg(dev, "mapping key-buffer: key_dma_addr=%pad keylen=%u\n",
@@ -999,7 +999,7 @@ static int cc_xcbc_setkey(struct crypto_ahash *ahash,
dev_dbg(dev, "Unmapped key-buffer: key_dma_addr=%pad keylen=%u\n",
&ctx->key_params.key_dma_addr, ctx->key_params.keylen);
- kzfree(ctx->key_params.key);
+ kfree_sensitive(ctx->key_params.key);
return rc;
}
diff --git a/drivers/crypto/ccree/cc_request_mgr.c b/drivers/crypto/ccree/cc_request_mgr.c
index 1d7649ecf44e..33fb27745d52 100644
--- a/drivers/crypto/ccree/cc_request_mgr.c
+++ b/drivers/crypto/ccree/cc_request_mgr.c
@@ -107,7 +107,7 @@ void cc_req_mgr_fini(struct cc_drvdata *drvdata)
/* Kill tasklet */
tasklet_kill(&req_mgr_h->comptask);
#endif
- kzfree(req_mgr_h);
+ kfree_sensitive(req_mgr_h);
drvdata->request_mgr_handle = NULL;
}
diff --git a/drivers/crypto/marvell/cesa/hash.c b/drivers/crypto/marvell/cesa/hash.c
index bd0bd9ffd6e9..f2a2fc111164 100644
--- a/drivers/crypto/marvell/cesa/hash.c
+++ b/drivers/crypto/marvell/cesa/hash.c
@@ -1157,7 +1157,7 @@ static int mv_cesa_ahmac_pad_init(struct ahash_request *req,
}
/* Set the memory region to 0 to avoid any leak. */
- kzfree(keydup);
+ kfree_sensitive(keydup);
if (ret)
return ret;
diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_main.c b/drivers/crypto/marvell/octeontx/otx_cptvf_main.c
index ce3168327a39..228fe8e47e0e 100644
--- a/drivers/crypto/marvell/octeontx/otx_cptvf_main.c
+++ b/drivers/crypto/marvell/octeontx/otx_cptvf_main.c
@@ -68,7 +68,7 @@ static void cleanup_worker_threads(struct otx_cptvf *cptvf)
for (i = 0; i < cptvf->num_queues; i++)
tasklet_kill(&cwqe_info->vq_wqe[i].twork);
- kzfree(cwqe_info);
+ kfree_sensitive(cwqe_info);
cptvf->wqe_info = NULL;
}
@@ -82,7 +82,7 @@ static void free_pending_queues(struct otx_cpt_pending_qinfo *pqinfo)
continue;
/* free single queue */
- kzfree((queue->head));
+ kfree_sensitive((queue->head));
queue->front = 0;
queue->rear = 0;
queue->qlen = 0;
@@ -176,7 +176,7 @@ static void free_command_queues(struct otx_cptvf *cptvf,
chunk->head = NULL;
chunk->dma_addr = 0;
list_del(&chunk->nextchunk);
- kzfree(chunk);
+ kfree_sensitive(chunk);
}
queue->num_chunks = 0;
queue->idx = 0;
diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_reqmgr.h b/drivers/crypto/marvell/octeontx/otx_cptvf_reqmgr.h
index d912fe0c532d..a02d059fb652 100644
--- a/drivers/crypto/marvell/octeontx/otx_cptvf_reqmgr.h
+++ b/drivers/crypto/marvell/octeontx/otx_cptvf_reqmgr.h
@@ -215,7 +215,7 @@ static inline void do_request_cleanup(struct pci_dev *pdev,
DMA_BIDIRECTIONAL);
}
}
- kzfree(info);
+ kfree_sensitive(info);
}
struct otx_cptvf_wqe;
diff --git a/drivers/crypto/nx/nx.c b/drivers/crypto/nx/nx.c
index f03c238f5a31..40882d6d52c1 100644
--- a/drivers/crypto/nx/nx.c
+++ b/drivers/crypto/nx/nx.c
@@ -746,7 +746,7 @@ void nx_crypto_ctx_exit(struct crypto_tfm *tfm)
{
struct nx_crypto_ctx *nx_ctx = crypto_tfm_ctx(tfm);
- kzfree(nx_ctx->kmem);
+ kfree_sensitive(nx_ctx->kmem);
nx_ctx->csbcpb = NULL;
nx_ctx->csbcpb_aead = NULL;
nx_ctx->in_sg = NULL;
@@ -762,7 +762,7 @@ void nx_crypto_ctx_aead_exit(struct crypto_aead *tfm)
{
struct nx_crypto_ctx *nx_ctx = crypto_aead_ctx(tfm);
- kzfree(nx_ctx->kmem);
+ kfree_sensitive(nx_ctx->kmem);
}
static int nx_probe(struct vio_dev *viodev, const struct vio_device_id *id)
diff --git a/drivers/crypto/virtio/virtio_crypto_algs.c b/drivers/crypto/virtio/virtio_crypto_algs.c
index b2601958282e..583c0b535d13 100644
--- a/drivers/crypto/virtio/virtio_crypto_algs.c
+++ b/drivers/crypto/virtio/virtio_crypto_algs.c
@@ -167,7 +167,7 @@ static int virtio_crypto_alg_skcipher_init_session(
num_in, vcrypto, GFP_ATOMIC);
if (err < 0) {
spin_unlock(&vcrypto->ctrl_lock);
- kzfree(cipher_key);
+ kfree_sensitive(cipher_key);
return err;
}
virtqueue_kick(vcrypto->ctrl_vq);
@@ -184,7 +184,7 @@ static int virtio_crypto_alg_skcipher_init_session(
spin_unlock(&vcrypto->ctrl_lock);
pr_err("virtio_crypto: Create session failed status: %u\n",
le32_to_cpu(vcrypto->input.status));
- kzfree(cipher_key);
+ kfree_sensitive(cipher_key);
return -EINVAL;
}
@@ -197,7 +197,7 @@ static int virtio_crypto_alg_skcipher_init_session(
spin_unlock(&vcrypto->ctrl_lock);
- kzfree(cipher_key);
+ kfree_sensitive(cipher_key);
return 0;
}
@@ -472,9 +472,9 @@ __virtio_crypto_skcipher_do_req(struct virtio_crypto_sym_request *vc_sym_req,
return 0;
free_iv:
- kzfree(iv);
+ kfree_sensitive(iv);
free:
- kzfree(req_data);
+ kfree_sensitive(req_data);
kfree(sgs);
return err;
}
@@ -583,7 +583,7 @@ static void virtio_crypto_skcipher_finalize_req(
scatterwalk_map_and_copy(req->iv, req->dst,
req->cryptlen - AES_BLOCK_SIZE,
AES_BLOCK_SIZE, 0);
- kzfree(vc_sym_req->iv);
+ kfree_sensitive(vc_sym_req->iv);
virtcrypto_clear_request(&vc_sym_req->base);
crypto_finalize_skcipher_request(vc_sym_req->base.dataq->engine,
diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
index 77e744eaedd0..0c66d6193ca2 100644
--- a/drivers/crypto/virtio/virtio_crypto_core.c
+++ b/drivers/crypto/virtio/virtio_crypto_core.c
@@ -17,7 +17,7 @@ void
virtcrypto_clear_request(struct virtio_crypto_request *vc_req)
{
if (vc_req) {
- kzfree(vc_req->req_data);
+ kfree_sensitive(vc_req->req_data);
kfree(vc_req->sgs);
}
}
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 4c2972f3153b..6de86e73dfc3 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -28,7 +28,6 @@
#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
#include <asm/dma-iommu.h>
-#include <asm/pgalloc.h>
#else
#define arm_iommu_create_mapping(...) NULL
#define arm_iommu_attach_device(...) -ENODEV
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b437a14c4942..37dcc52cf21d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -407,7 +407,7 @@ static void crypt_iv_lmk_dtr(struct crypt_config *cc)
crypto_free_shash(lmk->hash_tfm);
lmk->hash_tfm = NULL;
- kzfree(lmk->seed);
+ kfree_sensitive(lmk->seed);
lmk->seed = NULL;
}
@@ -558,9 +558,9 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
- kzfree(tcw->iv_seed);
+ kfree_sensitive(tcw->iv_seed);
tcw->iv_seed = NULL;
- kzfree(tcw->whitening);
+ kfree_sensitive(tcw->whitening);
tcw->whitening = NULL;
if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
@@ -994,8 +994,8 @@ static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *d
kunmap_atomic(data);
out:
- kzfree(ks);
- kzfree(es);
+ kfree_sensitive(ks);
+ kfree_sensitive(es);
skcipher_request_free(req);
return r;
}
@@ -2294,7 +2294,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
key = request_key(type, key_desc + 1, NULL);
if (IS_ERR(key)) {
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return PTR_ERR(key);
}
@@ -2304,7 +2304,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
if (ret < 0) {
up_read(&key->sem);
key_put(key);
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return ret;
}
@@ -2318,10 +2318,10 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
if (!ret) {
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = new_key_string;
} else
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return ret;
}
@@ -2382,7 +2382,7 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
/* wipe references to any kernel keyring key */
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = NULL;
/* Decode key from its hex representation. */
@@ -2414,7 +2414,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
return r;
}
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = NULL;
r = crypt_setkey(cc);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
@@ -2493,15 +2493,15 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- kzfree(cc->cipher_string);
- kzfree(cc->key_string);
- kzfree(cc->cipher_auth);
- kzfree(cc->authenc_key);
+ kfree_sensitive(cc->cipher_string);
+ kfree_sensitive(cc->key_string);
+ kfree_sensitive(cc->cipher_auth);
+ kfree_sensitive(cc->authenc_key);
mutex_destroy(&cc->bio_alloc_lock);
/* Must zero key material before freeing */
- kzfree(cc);
+ kfree_sensitive(cc);
spin_lock(&dm_crypt_clients_lock);
WARN_ON(!dm_crypt_clients_n);
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 5da3eb661e50..8c8d940e532e 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -3405,8 +3405,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
static void free_alg(struct alg_spec *a)
{
- kzfree(a->alg_string);
- kzfree(a->key);
+ kfree_sensitive(a->alg_string);
+ kfree_sensitive(a->key);
memset(a, 0, sizeof *a);
}
@@ -4337,7 +4337,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
for (i = 0; i < ic->journal_sections; i++) {
struct skcipher_request *req = ic->sk_requests[i];
if (req) {
- kzfree(req->iv);
+ kfree_sensitive(req->iv);
skcipher_request_free(req);
}
}
diff --git a/drivers/misc/ibmvmc.c b/drivers/misc/ibmvmc.c
index c0d139c26505..2d778d0f011e 100644
--- a/drivers/misc/ibmvmc.c
+++ b/drivers/misc/ibmvmc.c
@@ -286,7 +286,7 @@ static void *alloc_dma_buffer(struct vio_dev *vdev, size_t size,
if (dma_mapping_error(&vdev->dev, *dma_handle)) {
*dma_handle = 0;
- kzfree(buffer);
+ kfree_sensitive(buffer);
return NULL;
}
@@ -310,7 +310,7 @@ static void free_dma_buffer(struct vio_dev *vdev, size_t size, void *vaddr,
dma_unmap_single(&vdev->dev, dma_handle, size, DMA_BIDIRECTIONAL);
/* deallocate memory */
- kzfree(vaddr);
+ kfree_sensitive(vaddr);
}
/**
@@ -883,7 +883,7 @@ static int ibmvmc_close(struct inode *inode, struct file *file)
spin_unlock_irqrestore(&hmc->lock, flags);
}
- kzfree(session);
+ kfree_sensitive(session);
return rc;
}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 0874ae47cb03..3ab6db2588d3 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -137,7 +137,7 @@ static void hclge_free_vector_ring_chain(struct hnae3_ring_chain_node *head)
while (chain) {
chain_tmp = chain->next;
- kzfree(chain);
+ kfree_sensitive(chain);
chain = chain_tmp;
}
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 6516980965a2..eca73526ac86 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -960,9 +960,9 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf)
return 0;
err_aead:
- kzfree(xs->aead);
+ kfree_sensitive(xs->aead);
err_xs:
- kzfree(xs);
+ kfree_sensitive(xs);
err_out:
msgbuf[1] = err;
return err;
@@ -1047,7 +1047,7 @@ int ixgbe_ipsec_vf_del_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf)
ixgbe_ipsec_del_sa(xs);
/* remove the xs that was made-up in the add request */
- kzfree(xs);
+ kfree_sensitive(xs);
return 0;
}
diff --git a/drivers/net/ppp/ppp_mppe.c b/drivers/net/ppp/ppp_mppe.c
index de3b57d09d0c..208f6e24f37c 100644
--- a/drivers/net/ppp/ppp_mppe.c
+++ b/drivers/net/ppp/ppp_mppe.c
@@ -222,7 +222,7 @@ out_free:
kfree(state->sha1_digest);
if (state->sha1) {
crypto_free_shash(state->sha1->tfm);
- kzfree(state->sha1);
+ kfree_sensitive(state->sha1);
}
kfree(state);
out:
@@ -238,8 +238,8 @@ static void mppe_free(void *arg)
if (state) {
kfree(state->sha1_digest);
crypto_free_shash(state->sha1->tfm);
- kzfree(state->sha1);
- kzfree(state);
+ kfree_sensitive(state->sha1);
+ kfree_sensitive(state);
}
}
diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c
index 201a22681945..3dd3b76790d0 100644
--- a/drivers/net/wireguard/noise.c
+++ b/drivers/net/wireguard/noise.c
@@ -114,7 +114,7 @@ static struct noise_keypair *keypair_create(struct wg_peer *peer)
static void keypair_free_rcu(struct rcu_head *rcu)
{
- kzfree(container_of(rcu, struct noise_keypair, rcu));
+ kfree_sensitive(container_of(rcu, struct noise_keypair, rcu));
}
static void keypair_free_kref(struct kref *kref)
@@ -821,7 +821,7 @@ bool wg_noise_handshake_begin_session(struct noise_handshake *handshake,
handshake->entry.peer->device->index_hashtable,
&handshake->entry, &new_keypair->entry);
} else {
- kzfree(new_keypair);
+ kfree_sensitive(new_keypair);
}
rcu_read_unlock_bh();
diff --git a/drivers/net/wireguard/peer.c b/drivers/net/wireguard/peer.c
index 1d634bd3038f..b3b6370e6b95 100644
--- a/drivers/net/wireguard/peer.c
+++ b/drivers/net/wireguard/peer.c
@@ -203,7 +203,7 @@ static void rcu_release(struct rcu_head *rcu)
/* The final zeroing takes care of clearing any remaining handshake key
* material and other potentially sensitive information.
*/
- kzfree(peer);
+ kfree_sensitive(peer);
}
static void kref_release(struct kref *refcount)
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
index 24cb1b1f21f0..9463c108aa96 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
@@ -1369,7 +1369,7 @@ static void iwl_pcie_rx_handle_rb(struct iwl_trans *trans,
&rxcb, rxq->id);
if (reclaim) {
- kzfree(txq->entries[cmd_index].free_buf);
+ kfree_sensitive(txq->entries[cmd_index].free_buf);
txq->entries[cmd_index].free_buf = NULL;
}
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index 7fc7542535d8..606bef2ecc7b 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -1026,7 +1026,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans,
BUILD_BUG_ON(IWL_TFH_NUM_TBS > sizeof(out_meta->tbs) * BITS_PER_BYTE);
out_meta->flags = cmd->flags;
if (WARN_ON_ONCE(txq->entries[idx].free_buf))
- kzfree(txq->entries[idx].free_buf);
+ kfree_sensitive(txq->entries[idx].free_buf);
txq->entries[idx].free_buf = dup_buf;
trace_iwlwifi_dev_hcmd(trans->dev, cmd, cmd_size, &out_cmd->hdr_wide);
@@ -1257,8 +1257,8 @@ static void iwl_pcie_gen2_txq_free(struct iwl_trans *trans, int txq_id)
/* De-alloc array of command/tx buffers */
if (txq_id == trans->txqs.cmd.q_id)
for (i = 0; i < txq->n_window; i++) {
- kzfree(txq->entries[i].cmd);
- kzfree(txq->entries[i].free_buf);
+ kfree_sensitive(txq->entries[i].cmd);
+ kfree_sensitive(txq->entries[i].free_buf);
}
del_timer_sync(&txq->stuck_timer);
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index 5c6c3fa0d29f..eb396c06b7fb 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -721,8 +721,8 @@ static void iwl_pcie_txq_free(struct iwl_trans *trans, int txq_id)
/* De-alloc array of command/tx buffers */
if (txq_id == trans->txqs.cmd.q_id)
for (i = 0; i < txq->n_window; i++) {
- kzfree(txq->entries[i].cmd);
- kzfree(txq->entries[i].free_buf);
+ kfree_sensitive(txq->entries[i].cmd);
+ kfree_sensitive(txq->entries[i].free_buf);
}
/* De-alloc circular buffer of TFDs */
@@ -1765,7 +1765,7 @@ static int iwl_pcie_enqueue_hcmd(struct iwl_trans *trans,
BUILD_BUG_ON(IWL_TFH_NUM_TBS > sizeof(out_meta->tbs) * BITS_PER_BYTE);
out_meta->flags = cmd->flags;
if (WARN_ON_ONCE(txq->entries[idx].free_buf))
- kzfree(txq->entries[idx].free_buf);
+ kfree_sensitive(txq->entries[idx].free_buf);
txq->entries[idx].free_buf = dup_buf;
trace_iwlwifi_dev_hcmd(trans->dev, cmd, cmd_size, &out_cmd->hdr_wide);
diff --git a/drivers/net/wireless/intersil/orinoco/wext.c b/drivers/net/wireless/intersil/orinoco/wext.c
index 1d4dae422106..7b6c4ae8ddb3 100644
--- a/drivers/net/wireless/intersil/orinoco/wext.c
+++ b/drivers/net/wireless/intersil/orinoco/wext.c
@@ -31,8 +31,8 @@ static int orinoco_set_key(struct orinoco_private *priv, int index,
enum orinoco_alg alg, const u8 *key, int key_len,
const u8 *seq, int seq_len)
{
- kzfree(priv->keys[index].key);
- kzfree(priv->keys[index].seq);
+ kfree_sensitive(priv->keys[index].key);
+ kfree_sensitive(priv->keys[index].seq);
if (key_len) {
priv->keys[index].key = kzalloc(key_len, GFP_ATOMIC);
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 1a1d5e3c8d45..1ea046324e8f 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -219,8 +219,8 @@ static inline void ap_init_message(struct ap_message *ap_msg)
*/
static inline void ap_release_message(struct ap_message *ap_msg)
{
- kzfree(ap_msg->msg);
- kzfree(ap_msg->private);
+ kfree_sensitive(ap_msg->msg);
+ kfree_sensitive(ap_msg->private);
}
/*
diff --git a/drivers/staging/ks7010/ks_hostif.c b/drivers/staging/ks7010/ks_hostif.c
index b10a92ae2067..eaaf6a5440a9 100644
--- a/drivers/staging/ks7010/ks_hostif.c
+++ b/drivers/staging/ks7010/ks_hostif.c
@@ -245,7 +245,7 @@ michael_mic(u8 *key, u8 *data, unsigned int len, u8 priority, u8 *result)
ret = crypto_shash_finup(desc, data + 12, len - 12, result);
err_free_desc:
- kzfree(desc);
+ kfree_sensitive(desc);
err_free_tfm:
crypto_free_shash(tfm);
diff --git a/drivers/staging/rtl8723bs/core/rtw_security.c b/drivers/staging/rtl8723bs/core/rtw_security.c
index 0f15c96183a0..7f74e1d05b3a 100644
--- a/drivers/staging/rtl8723bs/core/rtw_security.c
+++ b/drivers/staging/rtl8723bs/core/rtw_security.c
@@ -2251,7 +2251,7 @@ static void gf_mulx(u8 *pad)
static void aes_encrypt_deinit(void *ctx)
{
- kzfree(ctx);
+ kfree_sensitive(ctx);
}
diff --git a/drivers/staging/wlan-ng/p80211netdev.c b/drivers/staging/wlan-ng/p80211netdev.c
index b809c0015c0c..7b091c5a2984 100644
--- a/drivers/staging/wlan-ng/p80211netdev.c
+++ b/drivers/staging/wlan-ng/p80211netdev.c
@@ -429,7 +429,7 @@ static netdev_tx_t p80211knetdev_hard_start_xmit(struct sk_buff *skb,
failed:
/* Free up the WEP buffer if it's not the same as the skb */
if ((p80211_wep.data) && (p80211_wep.data != skb->data))
- kzfree(p80211_wep.data);
+ kfree_sensitive(p80211_wep.data);
/* we always free the skb here, never in a lower level. */
if (!result)
diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c
index 0e54627d9aa8..62d912b79c61 100644
--- a/drivers/target/iscsi/iscsi_target_auth.c
+++ b/drivers/target/iscsi/iscsi_target_auth.c
@@ -484,7 +484,7 @@ static int chap_server_compute_hash(
pr_debug("[server] Sending CHAP_R=0x%s\n", response);
auth_ret = 0;
out:
- kzfree(desc);
+ kfree_sensitive(desc);
if (tfm)
crypto_free_shash(tfm);
kfree(initiatorchg);
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index b1d8b028bf80..37ffccda8bb8 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -58,7 +58,6 @@
#include <linux/sysctl.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/xen/hypervisor.h>
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 095d683ad574..63abe6c3642b 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -25,7 +25,6 @@
#include <linux/miscdevice.h>
#include <linux/moduleparam.h>
-#include <asm/pgalloc.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
diff --git a/fs/Kconfig b/fs/Kconfig
index a88aa3af73c1..aa4c12282301 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -201,6 +201,27 @@ config TMPFS_XATTR
If unsure, say N.
+config TMPFS_INODE64
+ bool "Use 64-bit ino_t by default in tmpfs"
+ depends on TMPFS && 64BIT
+ default n
+ help
+ tmpfs has historically used only inode numbers as wide as an unsigned
+ int. In some cases this can cause wraparound, potentially resulting
+ in multiple files with the same inode number on a single device. This
+ option makes tmpfs use the full width of ino_t by default, without
+ needing to specify the inode64 option when mounting.
+
+ But if a long-lived tmpfs is to be accessed by 32-bit applications so
+ ancient that opening a file larger than 2GiB fails with EINVAL, then
+ the INODE64 config option and inode64 mount option risk operations
+ failing with EOVERFLOW once 33-bit inode numbers are reached.
+
+ To override this configured default, use the inode32 or inode64
+ option when mounting.
+
+ If unsure, say N.
+
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
diff --git a/fs/aio.c b/fs/aio.c
index 91e7cc4a9f17..5736bff48e9e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -525,9 +525,9 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
return -EINTR;
}
- ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
- PROT_READ | PROT_WRITE,
- MAP_SHARED, 0, &unused, NULL);
+ ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, 0, &unused, NULL);
mmap_write_unlock(mm);
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
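
The aio ring setup is one of several callers converted from do_mmap_pgoff() to calling do_mmap() directly; the wrapper apparently existed only to hide a vm_flags argument that do_mmap() no longer takes. The signature assumed by this call site (declared in include/linux/mm.h; treat the exact parameter names as approximate):

        /* assumed prototype of do_mmap() once the pgoff wrapper is gone */
        unsigned long do_mmap(struct file *file, unsigned long addr,
                              unsigned long len, unsigned long prot,
                              unsigned long flags, unsigned long pgoff,
                              unsigned long *populate, struct list_head *uf);

Here pgoff is 0, the populate count lands in the local 'unused', and the uf list is NULL.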
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 0f45521b237c..cf306e0798fd 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -38,7 +38,6 @@
#include <linux/uaccess.h>
#include <asm/param.h>
-#include <asm/pgalloc.h>
typedef char *elf_caddr_t;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 874a551f339c..9daa256f69d4 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -797,7 +797,7 @@ calc_seckey(struct cifs_ses *ses)
ses->auth_key.len = CIFS_SESS_KEY_SIZE;
memzero_explicit(sec_key, CIFS_SESS_KEY_SIZE);
- kzfree(ctx_arc4);
+ kfree_sensitive(ctx_arc4);
return 0;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7e3e5e2098eb..0ad1309e88d3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2183,7 +2183,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
tmp_end++;
if (!(tmp_end < end && tmp_end[1] == delim)) {
/* No it is not. Set the password to NULL */
- kzfree(vol->password);
+ kfree_sensitive(vol->password);
vol->password = NULL;
break;
}
@@ -2221,7 +2221,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
options = end;
}
- kzfree(vol->password);
+ kfree_sensitive(vol->password);
/* Now build new password string */
temp_len = strlen(value);
vol->password = kzalloc(temp_len+1, GFP_KERNEL);
@@ -3199,7 +3199,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
rc = -ENOMEM;
kfree(vol->username);
vol->username = NULL;
- kzfree(vol->password);
+ kfree_sensitive(vol->password);
vol->password = NULL;
goto out_key_put;
}
@@ -4220,7 +4220,7 @@ void
cifs_cleanup_volume_info_contents(struct smb_vol *volume_info)
{
kfree(volume_info->username);
- kzfree(volume_info->password);
+ kfree_sensitive(volume_info->password);
kfree(volume_info->UNC);
kfree(volume_info->domainname);
kfree(volume_info->iocharset);
@@ -5339,7 +5339,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
out:
kfree(vol_info->username);
- kzfree(vol_info->password);
+ kfree_sensitive(vol_info->password);
kfree(vol_info);
return tcon;
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index a44f58bbf7ab..6ee849698962 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1191,7 +1191,7 @@ err_free_domainname:
err_free_unc:
kfree(new->UNC);
err_free_password:
- kzfree(new->password);
+ kfree_sensitive(new->password);
err_free_username:
kfree(new->username);
kfree(new);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 764234803071..1c14cf01dbef 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -103,12 +103,12 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree(buf_to_free->serverOS);
kfree(buf_to_free->serverDomain);
kfree(buf_to_free->serverNOS);
- kzfree(buf_to_free->password);
+ kfree_sensitive(buf_to_free->password);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
- kzfree(buf_to_free->auth_key.response);
+ kfree_sensitive(buf_to_free->auth_key.response);
kfree(buf_to_free->iface_list);
- kzfree(buf_to_free);
+ kfree_sensitive(buf_to_free);
}
struct cifs_tcon *
@@ -148,7 +148,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
}
atomic_dec(&tconInfoAllocCount);
kfree(buf_to_free->nativeFileSystem);
- kzfree(buf_to_free->password);
+ kfree_sensitive(buf_to_free->password);
kfree(buf_to_free->crfid.fid);
#ifdef CONFIG_CIFS_DFS_UPCALL
kfree(buf_to_free->dfs_path);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index b6b8574caa13..faa25541ccb6 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -16,6 +16,7 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
+#include <linux/slab.h>
#include "fscrypt_private.h"
@@ -187,7 +188,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
fail:
for (i = 0; i < queue_refs; i++)
blk_put_queue(blk_key->devs[i]);
- kzfree(blk_key);
+ kfree_sensitive(blk_key);
return err;
}
@@ -201,7 +202,7 @@ void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
blk_crypto_evict_key(blk_key->devs[i], &blk_key->base);
blk_put_queue(blk_key->devs[i]);
}
- kzfree(blk_key);
+ kfree_sensitive(blk_key);
}
}
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 71d56f8e2870..e74f239c4428 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -51,7 +51,7 @@ static void free_master_key(struct fscrypt_master_key *mk)
}
key_put(mk->mk_users);
- kzfree(mk);
+ kfree_sensitive(mk);
}
static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
@@ -531,7 +531,7 @@ static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep)
static void fscrypt_provisioning_key_free_preparse(
struct key_preparsed_payload *prep)
{
- kzfree(prep->payload.data[0]);
+ kfree_sensitive(prep->payload.data[0]);
}
static void fscrypt_provisioning_key_describe(const struct key *key,
@@ -548,7 +548,7 @@ static void fscrypt_provisioning_key_describe(const struct key *key,
static void fscrypt_provisioning_key_destroy(struct key *key)
{
- kzfree(key->payload.data[0]);
+ kfree_sensitive(key->payload.data[0]);
}
static struct key_type key_type_fscrypt_provisioning = {
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index e4e707fb1100..a3cb52572b05 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -155,7 +155,7 @@ static void free_direct_key(struct fscrypt_direct_key *dk)
{
if (dk) {
fscrypt_destroy_prepared_key(&dk->dk_key);
- kzfree(dk);
+ kfree_sensitive(dk);
}
}
@@ -283,7 +283,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci,
err = fscrypt_set_per_file_enc_key(ci, derived_key);
out:
- kzfree(derived_key);
+ kfree_sensitive(derived_key);
return err;
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af3eb02bbca1..f6a17d259db7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -838,7 +838,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
out_release_free_unlock:
crypto_free_shash(s->hash_tfm);
out_free_unlock:
- kzfree(s->block_aligned_filename);
+ kfree_sensitive(s->block_aligned_filename);
out_unlock:
mutex_unlock(s->tfm_mutex);
out:
@@ -847,7 +847,7 @@ out:
key_put(auth_tok_key);
}
skcipher_request_free(s->skcipher_req);
- kzfree(s->hash_desc);
+ kfree_sensitive(s->hash_desc);
kfree(s);
return rc;
}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 8646ba76def3..c0dfd9647627 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -175,7 +175,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
}
hlist_del(&daemon->euid_chain);
mutex_unlock(&daemon->mux);
- kzfree(daemon);
+ kfree_sensitive(daemon);
out:
return rc;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ef5313f9c78f..523954d00dff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -140,7 +140,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* already been checked by prepare_hugepage_range. If you add
* any error returns here, do so after setting VM_HUGETLB, so
* is_vm_hugetlb_page tests below unmap_region go the right
- * way when do_mmap_pgoff unwinds (may be important on powerpc
+ * way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index a87d4391e6b5..cd96083a12c8 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1504,7 +1504,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
na.type = AT_BITMAP;
na.name = I30;
na.name_len = 4;
- bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na);
+ bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
if (bmp_vi) {
write_inode_now(bmp_vi, !datasync);
iput(bmp_vi);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d4359a1df3d5..9bb9f0952b18 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -30,10 +30,10 @@
/**
* ntfs_test_inode - compare two (possibly fake) inodes for equality
* @vi: vfs inode which to test
- * @na: ntfs attribute which is being tested with
+ * @data: data which is being tested with
*
* Compare the ntfs attribute embedded in the ntfs specific part of the vfs
- * inode @vi for equality with the ntfs attribute @na.
+ * inode @vi for equality with the ntfs attribute @data.
*
* If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
* @na->name and @na->name_len are then ignored.
@@ -43,8 +43,9 @@
* NOTE: This function runs with the inode_hash_lock spin lock held so it is not
* allowed to sleep.
*/
-int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
+int ntfs_test_inode(struct inode *vi, void *data)
{
+ ntfs_attr *na = (ntfs_attr *)data;
ntfs_inode *ni;
if (vi->i_ino != na->mft_no)
@@ -72,9 +73,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
/**
* ntfs_init_locked_inode - initialize an inode
* @vi: vfs inode to initialize
- * @na: ntfs attribute which to initialize @vi to
+ * @data: data which to initialize @vi to
*
- * Initialize the vfs inode @vi with the values from the ntfs attribute @na in
+ * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
* order to enable ntfs_test_inode() to do its work.
*
* If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
@@ -87,8 +88,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
* NOTE: This function runs with the inode->i_lock spin lock held so it is not
* allowed to sleep. (Hence the GFP_ATOMIC allocation.)
*/
-static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
+static int ntfs_init_locked_inode(struct inode *vi, void *data)
{
+ ntfs_attr *na = (ntfs_attr *)data;
ntfs_inode *ni = NTFS_I(vi);
vi->i_ino = na->mft_no;
@@ -131,7 +133,6 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
return 0;
}
-typedef int (*set_t)(struct inode *, void *);
static int ntfs_read_locked_inode(struct inode *vi);
static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
static int ntfs_read_locked_index_inode(struct inode *base_vi,
@@ -164,8 +165,8 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
na.name = NULL;
na.name_len = 0;
- vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode,
- (set_t)ntfs_init_locked_inode, &na);
+ vi = iget5_locked(sb, mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
if (unlikely(!vi))
return ERR_PTR(-ENOMEM);
@@ -225,8 +226,8 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
na.name = name;
na.name_len = name_len;
- vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
- (set_t)ntfs_init_locked_inode, &na);
+ vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
if (unlikely(!vi))
return ERR_PTR(-ENOMEM);
@@ -280,8 +281,8 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
na.name = name;
na.name_len = name_len;
- vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
- (set_t)ntfs_init_locked_inode, &na);
+ vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
if (unlikely(!vi))
return ERR_PTR(-ENOMEM);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 98e670fbdd31..363e4e820673 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -253,9 +253,7 @@ typedef struct {
ATTR_TYPE type;
} ntfs_attr;
-typedef int (*test_t)(struct inode *, void *);
-
-extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na);
+extern int ntfs_test_inode(struct inode *vi, void *data);
extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index fbb9f1bc623d..0d62cd5bb7f8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -958,7 +958,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
* dirty code path of the inode dirty code path when writing
* $MFT occurs.
*/
- vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
+ vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
}
if (vi) {
ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
@@ -1019,7 +1019,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
vi = igrab(mft_vi);
BUG_ON(vi != mft_vi);
} else
- vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
+ vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
&na);
if (!vi) {
/*
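The three ntfs hunks above exist because iget5_locked() and ilookup5_nowait() declare their test/set callbacks as int (*)(struct inode *, void *); giving ntfs_test_inode() and ntfs_init_locked_inode() exactly that prototype lets the (test_t)/(set_t) function-pointer casts go away, which matters for -Wcast-function-type and Control Flow Integrity. A minimal sketch of the callback pattern, with hypothetical foo_* names and only standard VFS calls:

#include <linux/fs.h>
#include <linux/err.h>

struct foo_key {				/* hypothetical lookup key */
	unsigned long ino;
};

static int foo_test(struct inode *inode, void *data)
{
	struct foo_key *key = data;		/* no cast needed from void * */

	return inode->i_ino == key->ino;
}

static int foo_set(struct inode *inode, void *data)
{
	struct foo_key *key = data;

	inode->i_ino = key->ino;
	return 0;
}

static struct inode *foo_iget(struct super_block *sb, unsigned long ino)
{
	struct foo_key key = { .ino = ino };
	struct inode *inode;

	/* Both callbacks already match the declared function-pointer types. */
	inode = iget5_locked(sb, ino, foo_test, foo_set, &key);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW)
		unlock_new_inode(inode);
	return inode;
}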
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index aca16624b370..5d11380d8724 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -16,9 +16,9 @@ config OCFS2_FS
You'll want to install the ocfs2-tools package in order to at least
get "mount.ocfs2".
- Project web page: http://oss.oracle.com/projects/ocfs2
- Tools web page: http://oss.oracle.com/projects/ocfs2-tools
- OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+ Project web page: https://oss.oracle.com/projects/ocfs2
+ Tools web page: https://oss.oracle.com/projects/ocfs2-tools
+ OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/
For more information on OCFS2, see the file
<file:Documentation/filesystems/ocfs2.rst>.
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index bb981ec76456..7b07f5df3a29 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -256,6 +256,8 @@ static int ocfs2_set_acl(handle_t *handle,
ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
kfree(value);
+ if (!ret)
+ set_cached_acl(inode, type, acl);
return ret;
}
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index eaf042feaf5e..6e07ddb0e3c0 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -124,7 +124,7 @@ u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr
* parity bits that are part of the bit number
* representation. Huh?
*
- * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+ * <wikipedia href="https://en.wikipedia.org/wiki/Hamming_code">
* In other words, the parity bit at position 2^k
* checks bits in positions having bit k set in
* their binary representation. Conversely, for
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 751bc4dc7466..8e3a369086db 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2871,9 +2871,15 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
0, 0);
- if (status < 0)
+ if (status < 0) {
mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
+ }
+
return status;
}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 2dd71d626196..7993d527edae 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -327,8 +327,8 @@ struct ocfs2_super
spinlock_t osb_lock;
u32 s_next_generation;
unsigned long osb_flags;
- s16 s_inode_steal_slot;
- s16 s_meta_steal_slot;
+ u16 s_inode_steal_slot;
+ u16 s_meta_steal_slot;
atomic_t s_num_inodes_stolen;
atomic_t s_num_meta_stolen;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 45745cc3408a..8c8cf7f4eb34 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -879,9 +879,9 @@ static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
spin_lock(&osb->osb_lock);
if (type == INODE_ALLOC_SYSTEM_INODE)
- osb->s_inode_steal_slot = slot;
+ osb->s_inode_steal_slot = (u16)slot;
else if (type == EXTENT_ALLOC_SYSTEM_INODE)
- osb->s_meta_steal_slot = slot;
+ osb->s_meta_steal_slot = (u16)slot;
spin_unlock(&osb->osb_lock);
}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index f0a5d30a175d..50b36250beb6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -40,7 +40,7 @@ struct ocfs2_alloc_context {
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
- is the same as ~0 - unlimited */
+ the same as ~0 - unlimited */
int ac_find_loc_only; /* hack for reflink operation ordering */
struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 71ea9ce71a6b..1d91dd1e8711 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -78,7 +78,7 @@ struct mount_options
unsigned long commit_interval;
unsigned long mount_opt;
unsigned int atime_quantum;
- signed short slot;
+ unsigned short slot;
int localalloc_opt;
unsigned int resv_level;
int dir_resv_level;
@@ -1349,7 +1349,7 @@ static int ocfs2_parse_options(struct super_block *sb,
goto bail;
}
if (option)
- mopt->slot = (s16)option;
+ mopt->slot = (u16)option;
break;
case Opt_commit:
if (match_int(&args[0], &option)) {
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index e9a6841fc25b..887a5532e449 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -41,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
si_meminfo(&i);
si_swapinfo(&i);
- committed = percpu_counter_read_positive(&vm_committed_as);
+ committed = vm_memory_committed();
cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
@@ -52,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
- sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
- sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
+ sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+ sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
@@ -101,10 +101,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SReclaimable: ", sreclaimable);
show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
- global_zone_page_state(NR_KERNEL_STACK_KB));
+ global_node_page_state(NR_KERNEL_STACK_KB));
#ifdef CONFIG_SHADOW_CALL_STACK
seq_printf(m, "ShadowCallStack:%8lu kB\n",
- global_zone_page_state(NR_KERNEL_SCS_KB));
+ global_node_page_state(NR_KERNEL_SCS_KB));
#endif
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 73f7421413cb..6f44810921aa 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -102,6 +102,86 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
__free_page(pte_page);
}
+
+#if CONFIG_PGTABLE_LEVELS > 2
+
+#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
+/**
+ * pmd_alloc_one - allocate a page for PMD-level page table
+ * @mm: the mm_struct of the current context
+ *
+ * Allocates a page and runs the pgtable_pmd_page_ctor().
+ * Allocations use %GFP_PGTABLE_USER in user context and
+ * %GFP_PGTABLE_KERNEL in kernel context.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ struct page *page;
+ gfp_t gfp = GFP_PGTABLE_USER;
+
+ if (mm == &init_mm)
+ gfp = GFP_PGTABLE_KERNEL;
+ page = alloc_pages(gfp, 0);
+ if (!page)
+ return NULL;
+ if (!pgtable_pmd_page_ctor(page)) {
+ __free_pages(page, 0);
+ return NULL;
+ }
+ return (pmd_t *)page_address(page);
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMD_FREE
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ pgtable_pmd_page_dtor(virt_to_page(pmd));
+ free_page((unsigned long)pmd);
+}
+#endif
+
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+
+#if CONFIG_PGTABLE_LEVELS > 3
+
+#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
+/**
+ * pud_alloc_one - allocate a page for PUD-level page table
+ * @mm: the mm_struct of the current context
+ *
+ * Allocates a page using %GFP_PGTABLE_USER for user context and
+ * %GFP_PGTABLE_KERNEL for kernel context.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ gfp_t gfp = GFP_PGTABLE_USER;
+
+ if (mm == &init_mm)
+ gfp = GFP_PGTABLE_KERNEL;
+ return (pud_t *)get_zeroed_page(gfp);
+}
+#endif
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ free_page((unsigned long)pud);
+}
+
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+
+#ifndef __HAVE_ARCH_PGD_FREE
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ free_page((unsigned long)pgd);
+}
+#endif
+
#endif /* CONFIG_MMU */
#endif /* __ASM_GENERIC_PGALLOC_H */
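The new generic PMD/PUD helpers are wrapped in __HAVE_ARCH_* guards so an architecture with special requirements can keep its own definition while still pulling in the rest. A hedged sketch of how an arch header would opt out of just pmd_alloc_one(); the body below simply mirrors the generic version to show the mechanics, not any real architecture's needs:

/* hypothetical arch/foo/include/asm/pgalloc.h */
#define __HAVE_ARCH_PMD_ALLOC_ONE
#include <asm-generic/pgalloc.h>	/* supplies the helpers that were not overridden */

static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
	gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
	struct page *page = alloc_pages(gfp, 0);

	if (!page)
		return NULL;
	/* The ctor is still required so split PMD locks keep working. */
	if (!pgtable_pmd_page_ctor(page)) {
		__free_pages(page, 0);
		return NULL;
	}
	return (pmd_t *)page_address(page);
}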
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index ef75ec86f865..6661ee1cff47 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -14,7 +14,6 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/hugetlb_inline.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 62c68550aab6..c32a6f5664e9 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -425,7 +425,7 @@ static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm,
*/
static inline void aead_request_free(struct aead_request *req)
{
- kzfree(req);
+ kfree_sensitive(req);
}
/**
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index 6924b091adec..1d3aa252caba 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -207,7 +207,7 @@ static inline struct akcipher_request *akcipher_request_alloc(
*/
static inline void akcipher_request_free(struct akcipher_request *req)
{
- kzfree(req);
+ kfree_sensitive(req);
}
/**
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index fa0a63d298dc..81330c6446f6 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -230,7 +230,7 @@ void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t);
void gf128mul_x8_ble(le128 *r, const le128 *x);
static inline void gf128mul_free_4k(struct gf128mul_4k *t)
{
- kzfree(t);
+ kfree_sensitive(t);
}
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 19ce91f2359f..0d1b403888c9 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -606,7 +606,7 @@ static inline struct ahash_request *ahash_request_alloc(
*/
static inline void ahash_request_free(struct ahash_request *req)
{
- kzfree(req);
+ kfree_sensitive(req);
}
static inline void ahash_request_zero(struct ahash_request *req)
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index cf478681b53e..cfc47e18820f 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -46,7 +46,7 @@ static inline struct acomp_req *__acomp_request_alloc(struct crypto_acomp *tfm)
static inline void __acomp_request_free(struct acomp_req *req)
{
- kzfree(req);
+ kfree_sensitive(req);
}
/**
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index cd9a9b500624..88b591215d5c 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -187,7 +187,7 @@ static inline struct kpp_request *kpp_request_alloc(struct crypto_kpp *tfm,
*/
static inline void kpp_request_free(struct kpp_request *req)
{
- kzfree(req);
+ kfree_sensitive(req);
}
/**
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 5663f71198b3..6a733b171a5d 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -508,7 +508,7 @@ static inline struct skcipher_request *skcipher_request_alloc(
*/
static inline void skcipher_request_free(struct skcipher_request *req)
{
- kzfree(req);
+ kfree_sensitive(req);
}
static inline void skcipher_request_zero(struct skcipher_request *req)
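All of the crypto request-free helpers above switch from kzfree() to kfree_sensitive(); the behaviour is unchanged (zero, then free), only the name now says what the function does instead of suggesting a kzalloc() counterpart, and the kzfree() compatibility macro added to slab.h further down keeps old callers building. A small illustrative sketch, with a hypothetical my_key structure, of where the sensitive variant is warranted:

#include <linux/slab.h>
#include <linux/types.h>

struct my_key {				/* hypothetical structure holding a secret */
	unsigned int len;
	u8 data[64];
};

static struct my_key *my_key_alloc(void)
{
	return kzalloc(sizeof(struct my_key), GFP_KERNEL);
}

static void my_key_free(struct my_key *key)
{
	/*
	 * kfree_sensitive() zeroes the object before handing it back to the
	 * allocator, so the secret does not linger in freed slab memory;
	 * plain kfree() gives no such guarantee.
	 */
	kfree_sensitive(key);
}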
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 05c47f857383..73db1ae04cef 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -606,7 +606,11 @@ extern void *efi_get_pal_addr (void);
extern void efi_map_pal_code (void);
extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);
extern void efi_gettimeofday (struct timespec64 *ts);
+#ifdef CONFIG_EFI
extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if possible */
+#else
+static inline void efi_enter_virtual_mode (void) {}
+#endif
#ifdef CONFIG_X86
extern efi_status_t efi_query_variable_store(u32 attributes,
unsigned long size,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6d1976916635..2df72def1f59 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -528,7 +528,7 @@ static inline int mapping_mapped(struct address_space *mapping)
/*
* Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff
+ * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
* marks vma as VM_SHARED if it is shared, and the file was opened for
* writing i.e. vma may be mprotected writable even if now readonly.
*
@@ -2950,6 +2950,21 @@ extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
+/*
+ * Userspace may rely on the inode number being non-zero. For example, glibc
+ * simply ignores files with zero i_ino in unlink() and other places.
+ *
+ * As an additional complication, if userspace was compiled with
+ * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the
+ * lower 32 bits, so we need to check that those aren't zero explicitly. With
+ * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but
+ * better safe than sorry.
+ */
+static inline bool is_zero_ino(ino_t ino)
+{
+ return (u32)ino == 0;
+}
+
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
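is_zero_ino() encodes the comment above it: only the low 32 bits matter, because that is all a _FILE_OFFSET_BITS=32 userspace ever sees. A hedged sketch of an inode-number generator skipping such values; the static counter is a stand-in for per-superblock state (locking omitted), not any particular filesystem's code:

#include <linux/fs.h>

static ino_t next_ino_counter;		/* hypothetical per-sb state */

static ino_t my_next_ino(void)
{
	ino_t ino;

	do {
		ino = ++next_ino_counter;
		/*
		 * Skip values whose low 32 bits are zero: 32-bit userspace
		 * only ever reads those bits and treats 0 as "no inode".
		 */
	} while (is_zero_ino(ino));

	return ino;
}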
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 71f20776b06c..17c4c4975145 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -42,7 +42,7 @@ extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec);
extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
+ unsigned long new_addr,
pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 82522e996c76..087fba34b209 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -38,7 +38,6 @@ extern void kasan_disable_current(void);
void kasan_unpoison_shadow(const void *address, size_t size);
void kasan_unpoison_task_stack(struct task_struct *task);
-void kasan_unpoison_stack_above_sp_to(const void *watermark);
void kasan_alloc_pages(struct page *page, unsigned int order);
void kasan_free_pages(struct page *page, unsigned int order);
@@ -101,7 +100,6 @@ void kasan_restore_multi_shot(bool enabled);
static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
-static inline void kasan_unpoison_stack_above_sp_to(const void *watermark) {}
static inline void kasan_enable_current(void) {}
static inline void kasan_disable_current(void) {}
@@ -174,11 +172,13 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
void kasan_cache_shrink(struct kmem_cache *cache);
void kasan_cache_shutdown(struct kmem_cache *cache);
+void kasan_record_aux_stack(void *ptr);
#else /* CONFIG_KASAN_GENERIC */
static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
+static inline void kasan_record_aux_stack(void *ptr) {}
#endif /* CONFIG_KASAN_GENERIC */
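kasan_record_aux_stack() lets generic KASAN attach extra stack traces to an object; the kernel/rcu/tree.c hunk further down uses it so use-after-free reports on RCU-freed objects also show where the callback was queued. A sketch of a subsystem doing the same for its own deferred objects (my_item and its queueing helper are hypothetical):

#include <linux/kasan.h>
#include <linux/workqueue.h>

struct my_item {			/* hypothetical deferred object */
	struct work_struct work;
	int payload;
};

static void my_item_queue(struct workqueue_struct *wq, struct my_item *item)
{
	/*
	 * With CONFIG_KASAN_GENERIC this saves the current call stack as
	 * auxiliary metadata for @item; a later KASAN report about @item
	 * then shows the queueing stack in addition to the allocation and
	 * free stacks.  Without generic KASAN the stub above is a no-op.
	 */
	kasan_record_aux_stack(item);
	queue_work(wq, &item->work);
}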
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e77197a62809..1bb49b600310 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,6 +23,7 @@
#include <linux/page-flags.h>
struct mem_cgroup;
+struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
@@ -31,8 +32,6 @@ struct kmem_cache;
enum memcg_stat_item {
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
- /* XXX: why are these zone and not node counters? */
- MEMCG_KERNEL_STACK_KB,
MEMCG_NR_STAT,
};
@@ -48,12 +47,6 @@ enum memcg_memory_event {
MEMCG_NR_MEMORY_EVENTS,
};
-enum mem_cgroup_protection {
- MEMCG_PROT_NONE,
- MEMCG_PROT_LOW,
- MEMCG_PROT_MIN,
-};
-
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
unsigned int generation;
@@ -193,6 +186,22 @@ struct memcg_cgwb_frn {
};
/*
+ * Bucket for arbitrarily byte-sized objects charged to a memory
+ * cgroup. The bucket can be reparented in one piece when the cgroup
+ * is destroyed, without having to round up the individual references
+ * of all live memory objects in the wild.
+ */
+struct obj_cgroup {
+ struct percpu_ref refcnt;
+ struct mem_cgroup *memcg;
+ atomic_t nr_charged_bytes;
+ union {
+ struct list_head list;
+ struct rcu_head rcu;
+ };
+};
+
+/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -300,7 +309,8 @@ struct mem_cgroup {
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
enum memcg_kmem_state kmem_state;
- struct list_head kmem_caches;
+ struct obj_cgroup __rcu *objcg;
+ struct list_head objcg_list; /* list of inherited objcgs */
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -339,12 +349,49 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
bool in_low_reclaim)
{
if (mem_cgroup_disabled())
return 0;
+ /*
+ * There is no reclaim protection applied to a targeted reclaim.
+ * We are special casing this specific case here because
+ * mem_cgroup_protected calculation is not robust enough to keep
+ * the protection invariant for calculated effective values for
+ * parallel reclaimers with different reclaim target. This is
+ * especially a problem for tail memcgs (as they have pages on LRU)
+ * which would want to have effective values 0 for targeted reclaim
+ * but a different value for external reclaim.
+ *
+ * Example
+ * Let's have global and A's reclaim in parallel:
+ * |
+ * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
+ * |\
+ * | C (low = 1G, usage = 2.5G)
+ * B (low = 1G, usage = 0.5G)
+ *
+ * For the global reclaim
+ * A.elow = A.low
+ * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
+ * C.elow = min(C.usage, C.low)
+ *
+ * With the effective values resetting we have A reclaim
+ * A.elow = 0
+ * B.elow = B.low
+ * C.elow = C.low
+ *
+ * If the global reclaim races with A's reclaim then
+ * B.elow = C.elow = 0 (because children_low_usage > A.elow)
+ * is possible and reclaiming B would be violating the protection.
+ *
+ */
+ if (root == memcg)
+ return 0;
+
if (in_low_reclaim)
return READ_ONCE(memcg->memory.emin);
@@ -352,8 +399,36 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
READ_ONCE(memcg->memory.elow));
}
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg);
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg);
+
+static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg)
+{
+ /*
+ * The root memcg doesn't account charges, and doesn't support
+ * protection.
+ */
+ return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg);
+
+}
+
+static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
+{
+ if (!mem_cgroup_supports_protection(memcg))
+ return false;
+
+ return READ_ONCE(memcg->memory.elow) >=
+ page_counter_read(&memcg->memory);
+}
+
+static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
+{
+ if (!mem_cgroup_supports_protection(memcg))
+ return false;
+
+ return READ_ONCE(memcg->memory.emin) >=
+ page_counter_read(&memcg->memory);
+}
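These helpers replace the old MEMCG_PROT_* return value; after the effective values have been computed the caller asks two separate questions. A hedged sketch of the consuming side of a reclaim loop, a simplification of what mm/vmscan.c does with them rather than the actual code:

#include <linux/memcontrol.h>

static void scan_one_memcg(struct mem_cgroup *target_memcg,
			   struct mem_cgroup *memcg)
{
	mem_cgroup_calculate_protection(target_memcg, memcg);

	if (mem_cgroup_below_min(memcg))
		/* Hard protection: never reclaim from this group. */
		return;

	if (mem_cgroup_below_low(memcg)) {
		/* Soft protection: skip this group and note the event. */
		memcg_memory_event(memcg, MEMCG_LOW);
		return;
	}

	/* ... shrink this group's LRU lists ... */
}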
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
@@ -416,6 +491,33 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
+{
+ return percpu_ref_tryget(&objcg->refcnt);
+}
+
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+{
+ percpu_ref_get(&objcg->refcnt);
+}
+
+static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+{
+ percpu_ref_put(&objcg->refcnt);
+}
+
+/*
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
+ *
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+{
+ return READ_ONCE(objcg->memcg);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -679,11 +781,34 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val);
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val);
+
void mod_memcg_obj_state(void *p, int idx, int val);
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+ int val)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_lruvec_slab_state(p, idx, val);
+ local_irq_restore(flags);
+}
+
+static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx, int val)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_memcg_lruvec_state(lruvec, idx, val);
+ local_irq_restore(flags);
+}
+
static inline void mod_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
@@ -825,16 +950,26 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
{
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
bool in_low_reclaim)
{
return 0;
}
-static inline enum mem_cgroup_protection mem_cgroup_protected(
- struct mem_cgroup *root, struct mem_cgroup *memcg)
+static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
+{
+}
+
+static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
+{
+ return false;
+}
+
+static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
{
- return MEMCG_PROT_NONE;
+ return false;
}
static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
@@ -1057,6 +1192,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
}
+static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx, int val)
+{
+}
+
static inline void __mod_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
@@ -1089,6 +1229,14 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx,
__mod_node_page_state(page_pgdat(page), idx, val);
}
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+ int val)
+{
+ struct page *page = virt_to_head_page(p);
+
+ mod_node_page_state(page_pgdat(page), idx, val);
+}
+
static inline void mod_memcg_obj_state(void *p, int idx, int val)
{
}
@@ -1341,9 +1489,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
}
#endif
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
-void memcg_kmem_put_cache(struct kmem_cache *cachep);
-
#ifdef CONFIG_MEMCG_KMEM
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
unsigned int nr_pages);
@@ -1351,8 +1496,12 @@ void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);
+struct obj_cgroup *get_obj_cgroup_from_current(void);
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
+
extern struct static_key_false memcg_kmem_enabled_key;
-extern struct workqueue_struct *memcg_kmem_cache_wq;
extern int memcg_nr_cache_ids;
void memcg_get_cache_ids(void);
@@ -1368,7 +1517,19 @@ void memcg_put_cache_ids(void);
static inline bool memcg_kmem_enabled(void)
{
- return static_branch_unlikely(&memcg_kmem_enabled_key);
+ return static_branch_likely(&memcg_kmem_enabled_key);
+}
+
+static inline bool memcg_kmem_bypass(void)
+{
+ if (in_interrupt())
+ return true;
+
+ /* Allow remote memcg charging in kthread contexts. */
+ if ((!current->mm || (current->flags & PF_KTHREAD)) &&
+ !current->active_memcg)
+ return true;
+ return false;
}
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
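obj_cgroup is what allows slab memory to be charged per byte and reparented wholesale; the charge/uncharge API above is meant to be paired around an allocation, with the objcg pointer stored next to the object so the free path can find it, and obj_cgroup_memcg() only dereferenced under RCU because reparenting may swap the pointer. A hedged sketch of that pairing; the hook names are hypothetical and the real slab hooks live in mm/:

#include <linux/memcontrol.h>

static struct obj_cgroup *my_pre_alloc_hook(size_t size, gfp_t gfp)
{
	struct obj_cgroup *objcg;

	if (memcg_kmem_bypass())
		return NULL;

	objcg = get_obj_cgroup_from_current();	/* takes a reference */
	if (!objcg)
		return NULL;

	if (obj_cgroup_charge(objcg, gfp, size)) {
		obj_cgroup_put(objcg);
		return NULL;
	}
	return objcg;	/* stored alongside the object, e.g. in page->obj_cgroups[] */
}

static void my_free_hook(struct obj_cgroup *objcg, size_t size)
{
	if (!objcg)
		return;
	obj_cgroup_uncharge(objcg, size);
	obj_cgroup_put(objcg);	/* may free the objcg once reparenting is done */
}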
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6c8333d6c991..f6a82f9bccd7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,6 +206,8 @@ int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
+int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
+ loff_t *);
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
@@ -777,6 +779,11 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
extern void kvfree(const void *addr);
extern void kvfree_sensitive(const void *addr, size_t len);
+static inline int head_mapcount(struct page *head)
+{
+ return atomic_read(compound_mapcount_ptr(head)) + 1;
+}
+
/*
* Mapcount of compound page as a whole, does not include mapped sub-pages.
*
@@ -786,7 +793,7 @@ static inline int compound_mapcount(struct page *page)
{
VM_BUG_ON_PAGE(!PageCompound(page), page);
page = compound_head(page);
- return atomic_read(compound_mapcount_ptr(page)) + 1;
+ return head_mapcount(page);
}
/*
@@ -899,11 +906,16 @@ static inline bool hpage_pincount_available(struct page *page)
return PageCompound(page) && compound_order(page) > 1;
}
+static inline int head_pincount(struct page *head)
+{
+ return atomic_read(compound_pincount_ptr(head));
+}
+
static inline int compound_pincount(struct page *page)
{
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
page = compound_head(page);
- return atomic_read(compound_pincount_ptr(page));
+ return head_pincount(page);
}
static inline void set_compound_order(struct page *page, unsigned int order)
@@ -2091,51 +2103,11 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
NULL : pud_offset(p4d, address);
}
-static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
- unsigned long address,
- pgtbl_mod_mask *mod_mask)
-
-{
- if (unlikely(pgd_none(*pgd))) {
- if (__p4d_alloc(mm, pgd, address))
- return NULL;
- *mod_mask |= PGTBL_PGD_MODIFIED;
- }
-
- return p4d_offset(pgd, address);
-}
-
-static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
- unsigned long address,
- pgtbl_mod_mask *mod_mask)
-{
- if (unlikely(p4d_none(*p4d))) {
- if (__pud_alloc(mm, p4d, address))
- return NULL;
- *mod_mask |= PGTBL_P4D_MODIFIED;
- }
-
- return pud_offset(p4d, address);
-}
-
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
NULL: pmd_offset(pud, address);
}
-
-static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
- unsigned long address,
- pgtbl_mod_mask *mod_mask)
-{
- if (unlikely(pud_none(*pud))) {
- if (__pmd_alloc(mm, pud, address))
- return NULL;
- *mod_mask |= PGTBL_PUD_MODIFIED;
- }
-
- return pmd_offset(pud, address);
-}
#endif /* CONFIG_MMU */
#if USE_SPLIT_PTE_PTLOCKS
@@ -2251,11 +2223,6 @@ static inline void pgtable_pte_page_dtor(struct page *page)
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
NULL: pte_offset_kernel(pmd, address))
-#define pte_alloc_kernel_track(pmd, address, mask) \
- ((unlikely(pmd_none(*(pmd))) && \
- (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
- NULL: pte_offset_kernel(pmd, address))
-
#if USE_SPLIT_PMD_PTLOCKS
static struct page *pmd_to_page(pmd_t *pmd)
@@ -2413,9 +2380,6 @@ static inline unsigned long get_num_physpages(void)
* for_each_valid_physical_page_range()
* memblock_add_node(base, size, nid)
* free_area_init(max_zone_pfns);
- *
- * sparse_memory_present_with_active_regions() calls memory_present() for
- * each range when SPARSEMEM is enabled.
*/
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
@@ -2426,7 +2390,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
extern void get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn);
extern unsigned long find_min_pfn_with_active_regions(void);
-extern void sparse_memory_present_with_active_regions(int nid);
#ifndef CONFIG_NEED_MULTIPLE_NODES
static inline int early_pfn_to_nid(unsigned long pfn)
@@ -2577,23 +2540,13 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
- vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
- struct list_head *uf);
+ unsigned long pgoff, unsigned long *populate, struct list_head *uf);
extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(unsigned long start, size_t len_in, int behavior);
-static inline unsigned long
-do_mmap_pgoff(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot, unsigned long flags,
- unsigned long pgoff, unsigned long *populate,
- struct list_head *uf)
-{
- return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
-}
-
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
int ignore_errors);
@@ -3009,14 +2962,15 @@ pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
-pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
+pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+ struct vmem_altmap *altmap);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
-void *vmemmap_alloc_block_buf(unsigned long size, int node);
-void *altmap_alloc_block_buf(unsigned long size, struct vmem_altmap *altmap);
+void *vmemmap_alloc_block_buf(unsigned long size, int node,
+ struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
- int node);
+ int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
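The vmemmap helpers now thread a struct vmem_altmap all the way down, so ZONE_DEVICE users can back struct pages with the device's own reserved range instead of regular pages. A hedged sketch of the simplest arch-side consumer after the signature change, mirroring the basepages-only configurations rather than any specific architecture:

#include <linux/mm.h>

int __meminit vmemmap_populate(unsigned long start, unsigned long end,
			       int node, struct vmem_altmap *altmap)
{
	/*
	 * Each PTE backing the vmemmap is allocated through
	 * vmemmap_pte_populate(), which now passes @altmap on to
	 * vmemmap_alloc_block_buf(); with a non-NULL altmap the backing
	 * memory comes from the device-provided range.
	 */
	return vmemmap_populate_basepages(start, end, node, altmap);
}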
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 64ede5f150dc..0277fbab7c93 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -198,7 +198,10 @@ struct page {
atomic_t _refcount;
#ifdef CONFIG_MEMCG
- struct mem_cgroup *mem_cgroup;
+ union {
+ struct mem_cgroup *mem_cgroup;
+ struct obj_cgroup **obj_cgroups;
+ };
#endif
/*
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 4b08e9c9c538..6f34c33075f9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -57,8 +57,12 @@ extern struct percpu_counter vm_committed_as;
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
+extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
+static inline void mm_compute_batch(int overcommit_policy)
+{
+}
#endif
unsigned long vm_memory_committed(void);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index c6f0708195cd..b8200782dede 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -521,6 +521,16 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
range->flags = flags;
}
+static inline void mmu_notifier_range_init_migrate(
+ struct mmu_notifier_range *range, unsigned int flags,
+ struct vm_area_struct *vma, struct mm_struct *mm,
+ unsigned long start, unsigned long end, void *pgmap)
+{
+ mmu_notifier_range_init(range, MMU_NOTIFY_MIGRATE, flags, vma, mm,
+ start, end);
+ range->migrate_pgmap_owner = pgmap;
+}
+
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
({ \
int __young; \
@@ -645,6 +655,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \
_mmu_notifier_range_init(range, start, end)
+#define mmu_notifier_range_init_migrate(range, flags, vma, mm, start, end, \
+ pgmap) \
+ _mmu_notifier_range_init(range, start, end)
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
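mmu_notifier_range_init_migrate() tags an invalidation with MMU_NOTIFY_MIGRATE and a pgmap owner so the driver that triggered a device-memory migration can recognise its own events instead of serialising against itself. A hedged sketch of a migration path using it (the function and how it receives the owner pointer are hypothetical):

#include <linux/mmu_notifier.h>

static void my_migrate_collect(struct vm_area_struct *vma,
			       unsigned long start, unsigned long end,
			       void *pgmap_owner)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init_migrate(&range, 0, vma, vma->vm_mm,
					start, end, pgmap_owner);
	mmu_notifier_invalidate_range_start(&range);

	/* ... walk page tables and install migration entries ... */

	mmu_notifier_invalidate_range_end(&range);
}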
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f6f884970511..635a96cd9b1f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -88,12 +88,10 @@ static inline bool is_migrate_movable(int mt)
extern int page_group_by_mobility_disabled;
-#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
-#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
+#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)
#define get_pageblock_migratetype(page) \
- get_pfnblock_flags_mask(page, page_to_pfn(page), \
- PB_migrate_end, MIGRATETYPE_MASK)
+ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
@@ -155,10 +153,6 @@ enum zone_stat_item {
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
NR_PAGETABLE, /* used for pagetables */
- NR_KERNEL_STACK_KB, /* measured in KiB */
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- NR_KERNEL_SCS_KB, /* measured in KiB */
-#endif
/* Second 128 byte cacheline */
NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
@@ -174,8 +168,8 @@ enum node_stat_item {
NR_INACTIVE_FILE, /* " " " " " */
NR_ACTIVE_FILE, /* " " " " " */
NR_UNEVICTABLE, /* " " " " " */
- NR_SLAB_RECLAIMABLE,
- NR_SLAB_UNRECLAIMABLE,
+ NR_SLAB_RECLAIMABLE_B,
+ NR_SLAB_UNRECLAIMABLE_B,
NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
WORKINGSET_NODES,
@@ -203,10 +197,34 @@ enum node_stat_item {
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */
NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */
+ NR_KERNEL_STACK_KB, /* measured in KiB */
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ NR_KERNEL_SCS_KB, /* measured in KiB */
+#endif
NR_VM_NODE_STAT_ITEMS
};
/*
+ * Returns true if the value is measured in bytes (most vmstat values are
+ * measured in pages). This defines the API part, the internal representation
+ * might be different.
+ */
+static __always_inline bool vmstat_item_in_bytes(int idx)
+{
+ /*
+ * Global and per-node slab counters track slab pages.
+ * It's expected that changes are multiples of PAGE_SIZE.
+ * Internally values are stored in pages.
+ *
+ * Per-memcg and per-lruvec counters track memory, consumed
+ * by individual slab objects. These counters are actually
+ * byte-precise.
+ */
+ return (idx == NR_SLAB_RECLAIMABLE_B ||
+ idx == NR_SLAB_UNRECLAIMABLE_B);
+}
+
+/*
* We do arithmetic on the LRU lists in various places in the code,
* so it is important to keep the active lists LRU_ACTIVE higher in
* the array than the corresponding inactive lists, and to keep
@@ -819,18 +837,6 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
-#ifdef CONFIG_HAVE_MEMORY_PRESENT
-void memory_present(int nid, unsigned long start, unsigned long end);
-#else
-static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
-#endif
-
-#if defined(CONFIG_SPARSEMEM)
-void memblocks_present(void);
-#else
-static inline void memblocks_present(void) {}
-#endif
-
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
@@ -1387,8 +1393,6 @@ struct mminit_pfnnid_cache {
#define early_pfn_valid(pfn) (1)
#endif
-void memory_present(int nid, unsigned long start, unsigned long end);
-
/*
* If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
* need to check pfn validity within that MAX_ORDER_NR_PAGES block.
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index c066fec5b74b..fff52ad370c1 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -56,35 +56,25 @@ struct page;
unsigned long get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask);
void set_pfnblock_flags_mask(struct page *page,
unsigned long flags,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask);
/* Declarations for getting and setting flags. See mm/page_alloc.c */
-#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
- get_pfnblock_flags_mask(page, page_to_pfn(page), \
- end_bitidx, \
- (1 << (end_bitidx - start_bitidx + 1)) - 1)
-#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
- set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \
- end_bitidx, \
- (1 << (end_bitidx - start_bitidx + 1)) - 1)
-
#ifdef CONFIG_COMPACTION
#define get_pageblock_skip(page) \
- get_pageblock_flags_group(page, PB_migrate_skip, \
- PB_migrate_skip)
+ get_pfnblock_flags_mask(page, page_to_pfn(page), \
+ (1 << (PB_migrate_skip)))
#define clear_pageblock_skip(page) \
- set_pageblock_flags_group(page, 0, PB_migrate_skip, \
- PB_migrate_skip)
+ set_pfnblock_flags_mask(page, 0, page_to_pfn(page), \
+ (1 << PB_migrate_skip))
#define set_pageblock_skip(page) \
- set_pageblock_flags_group(page, 1, PB_migrate_skip, \
- PB_migrate_skip)
+ set_pfnblock_flags_mask(page, (1 << PB_migrate_skip), \
+ page_to_pfn(page), \
+ (1 << PB_migrate_skip))
#else
static inline bool get_pageblock_skip(struct page *page)
{
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 0a4f54dd4737..01861eebed79 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -44,6 +44,7 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+void percpu_counter_sync(struct percpu_counter *fbc);
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
@@ -172,6 +173,9 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
return true;
}
+static inline void percpu_counter_sync(struct percpu_counter *fbc)
+{
+}
#endif /* CONFIG_SMP */
static inline void percpu_counter_inc(struct percpu_counter *fbc)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 6be66f52a2ad..85023ddc2dc2 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -175,12 +175,10 @@ static inline bool in_vfork(struct task_struct *tsk)
* Applies per-task gfp context to the given allocation flags.
* PF_MEMALLOC_NOIO implies GFP_NOIO
* PF_MEMALLOC_NOFS implies GFP_NOFS
- * PF_MEMALLOC_NOCMA implies no allocation from CMA region.
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
- if (unlikely(current->flags &
- (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) {
+ if (unlikely(current->flags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
@@ -189,10 +187,6 @@ static inline gfp_t current_gfp_context(gfp_t flags)
flags &= ~(__GFP_IO | __GFP_FS);
else if (current->flags & PF_MEMALLOC_NOFS)
flags &= ~__GFP_FS;
-#ifdef CONFIG_CMA
- if (current->flags & PF_MEMALLOC_NOCMA)
- flags &= ~__GFP_MOVABLE;
-#endif
}
return flags;
}
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 7a35a6901221..a5a5d1d4d7b1 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -36,6 +36,9 @@ struct shmem_sb_info {
unsigned char huge; /* Whether to try for hugepages */
kuid_t uid; /* Mount uid for root directory */
kgid_t gid; /* Mount gid for root directory */
+ bool full_inums; /* If i_ino should be uint or ino_t */
+ ino_t next_ino; /* The next per-sb inode number to use */
+ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */
struct mempolicy *mpol; /* default memory policy for mappings */
spinlock_t shrinklist_lock; /* Protects shrinklist */
 struct list_head shrinklist; /* List of shrinkable inodes */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6d454886bcaf..24df2393ec03 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -155,9 +155,6 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);
-void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
-void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
-
/*
* Please use this macro to create slab caches. Simply specify the
* name of the structure and maybe some flags that are listed above.
@@ -186,10 +183,12 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
*/
void * __must_check krealloc(const void *, size_t, gfp_t);
void kfree(const void *);
-void kzfree(const void *);
+void kfree_sensitive(const void *);
size_t __ksize(const void *);
size_t ksize(const void *);
+#define kzfree(x) kfree_sensitive(x) /* For backward compatibility */
+
#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
bool to_user);
@@ -578,8 +577,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
return __kmalloc_node(size, flags, node);
}
-int memcg_update_all_caches(int num_memcgs);
-
/**
* kmalloc_array - allocate memory for an array.
* @n: number of elements.
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index abc7de77b988..9eb430c163c2 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -72,9 +72,6 @@ struct kmem_cache {
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
-#ifdef CONFIG_MEMCG
- struct memcg_cache_params memcg_params;
-#endif
#ifdef CONFIG_KASAN
struct kasan_cache kasan_info;
#endif
@@ -114,4 +111,10 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
+static inline int objs_per_slab_page(const struct kmem_cache *cache,
+ const struct page *page)
+{
+ return cache->num;
+}
+
#endif /* _LINUX_SLAB_DEF_H */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d2153789bd9f..1be0ed5befa1 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -8,6 +8,7 @@
* (C) 2007 SGI, Christoph Lameter
*/
#include <linux/kobject.h>
+#include <linux/reciprocal_div.h>
enum stat_item {
ALLOC_FASTPATH, /* Allocation from cpu slab */
@@ -86,6 +87,7 @@ struct kmem_cache {
unsigned long min_partial;
unsigned int size; /* The size of an object including metadata */
unsigned int object_size;/* The size of an object without metadata */
+ struct reciprocal_value reciprocal_size;
unsigned int offset; /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
/* Number of per cpu partial objects to keep around */
@@ -106,17 +108,7 @@ struct kmem_cache {
struct list_head list; /* List of slab caches */
#ifdef CONFIG_SYSFS
struct kobject kobj; /* For sysfs */
- struct work_struct kobj_remove_work;
#endif
-#ifdef CONFIG_MEMCG
- struct memcg_cache_params memcg_params;
- /* For propagation, maximum size of a stored attr */
- unsigned int max_attr_size;
-#ifdef CONFIG_SYSFS
- struct kset *memcg_kset;
-#endif
-#endif
-
#ifdef CONFIG_SLAB_FREELIST_HARDENED
unsigned long random;
#endif
@@ -182,4 +174,23 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
return result;
}
+/* Determine object index from a given position */
+static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
+ void *addr, void *obj)
+{
+ return reciprocal_divide(kasan_reset_tag(obj) - addr,
+ cache->reciprocal_size);
+}
+
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+ const struct page *page, void *obj)
+{
+ return __obj_to_index(cache, page_address(page), obj);
+}
+
+static inline int objs_per_slab_page(const struct kmem_cache *cache,
+ const struct page *page)
+{
+ return page->objects;
+}
#endif /* _LINUX_SLUB_DEF_H */
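SLUB now precomputes a reciprocal_value of the object size at cache creation so __obj_to_index() can turn an offset within the slab page into an object index without an integer division on every lookup; the per-object obj_cgroups vector introduced elsewhere in this series is indexed exactly this way. A small hedged sketch of the technique itself, with a hypothetical toy_cache:

#include <linux/reciprocal_div.h>

struct toy_cache {				/* hypothetical */
	unsigned int size;			/* object size including metadata */
	struct reciprocal_value reciprocal_size;
};

static void toy_cache_init(struct toy_cache *c, unsigned int size)
{
	c->size = size;
	c->reciprocal_size = reciprocal_value(size);	/* one-time cost */
}

static unsigned int toy_obj_to_index(const struct toy_cache *c,
				     void *slab_base, void *obj)
{
	/* Equivalent to ((char *)obj - (char *)slab_base) / c->size. */
	return reciprocal_divide((char *)obj - (char *)slab_base,
				 c->reciprocal_size);
}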
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5b3216ba39a9..7eb59bc552a5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -328,7 +328,6 @@ void workingset_update_node(struct xa_node *node);
/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;
extern unsigned long nr_free_buffer_pages(void);
-extern unsigned long nr_free_pagecache_pages(void);
/* Definition of global_zone_page_state not available yet */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
@@ -372,7 +371,6 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
-extern unsigned long vm_total_pages;
extern unsigned long reclaim_pages(struct list_head *page_list);
#ifdef CONFIG_NUMA
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index aa961088c551..91220ace31da 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -8,6 +8,7 @@
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
+#include <linux/mmdebug.h>
extern int sysctl_stat_interval;
@@ -192,7 +193,8 @@ static inline unsigned long global_zone_page_state(enum zone_stat_item item)
return x;
}
-static inline unsigned long global_node_page_state(enum node_stat_item item)
+static inline
+unsigned long global_node_page_state_pages(enum node_stat_item item)
{
long x = atomic_long_read(&vm_node_stat[item]);
#ifdef CONFIG_SMP
@@ -202,6 +204,13 @@ static inline unsigned long global_node_page_state(enum node_stat_item item)
return x;
}
+static inline unsigned long global_node_page_state(enum node_stat_item item)
+{
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+ return global_node_page_state_pages(item);
+}
+
static inline unsigned long zone_page_state(struct zone *zone,
enum zone_stat_item item)
{
@@ -242,9 +251,12 @@ extern unsigned long sum_zone_node_page_state(int node,
extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
+extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
+ enum node_stat_item item);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
+#define node_page_state_pages(node, item) global_node_page_state_pages(item)
#endif /* CONFIG_NUMA */
#ifdef CONFIG_SMP
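The byte-suffixed slab items are byte-precise only at the memcg/lruvec level; node and global counters still hold pages internally, and global_node_page_state() now warns when asked for a _B item so callers have to be explicit. A short sketch of the reader side, matching what the fs/proc/meminfo.c hunk above does:

#include <linux/vmstat.h>
#include <linux/mmzone.h>

static unsigned long slab_pages_total(void)
{
	/*
	 * Use the _pages() accessor for NR_SLAB_*_B: the plain
	 * global_node_page_state() would trip VM_WARN_ON_ONCE() because
	 * vmstat_item_in_bytes() is true for these items.
	 */
	return global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
	       global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
}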
diff --git a/init/Kconfig b/init/Kconfig
index 9082ed33a9cd..d6a0b31b13dc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1913,9 +1913,8 @@ config SLAB_MERGE_DEFAULT
command line.
config SLAB_FREELIST_RANDOM
- default n
+ bool "Randomize slab freelist"
depends on SLAB || SLUB
- bool "SLAB freelist randomization"
help
Randomizes the freelist order used on creating new pages. This
security feature reduces the predictability of the kernel slab
@@ -1923,12 +1922,14 @@ config SLAB_FREELIST_RANDOM
config SLAB_FREELIST_HARDENED
bool "Harden slab freelist metadata"
- depends on SLUB
+ depends on SLAB || SLUB
help
Many kernel heap attacks try to target slab cache metadata and
 other infrastructure. This option makes minor performance
sacrifices to harden the kernel slab allocator against common
- freelist exploit methods.
+ freelist exploit methods. Some slab implementations have more
+ sanity-checking than others. This option is most effective with
+ CONFIG_SLUB.
config SHUFFLE_PAGE_ALLOCATOR
bool "Page allocator randomization"
diff --git a/init/main.c b/init/main.c
index 276ca3160c8c..de2f9fa6bb4a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -830,7 +830,7 @@ void __init __weak arch_call_rest_init(void)
rest_init();
}
-asmlinkage __visible void __init start_kernel(void)
+asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
char *command_line;
char *after_dashes;
diff --git a/ipc/shm.c b/ipc/shm.c
index 0a6dd94afa21..bf38d7e2fbe9 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1558,7 +1558,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
goto invalid;
}
- addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL);
+ addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
*raddr = addr;
err = 0;
if (IS_ERR_VALUE(addr))
diff --git a/kernel/fork.c b/kernel/fork.c
index 76d3f3387554..35e9894d394c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREAD_SIZE_ORDER);
if (likely(page)) {
- tsk->stack = page_address(page);
+ tsk->stack = kasan_reset_tag(page_address(page));
return tsk->stack;
}
return NULL;
@@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk)
if (vm) {
int i;
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- -(int)(PAGE_SIZE / 1024));
-
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
- }
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
@@ -307,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ stack = kasan_reset_tag(stack);
tsk->stack = stack;
return stack;
}
@@ -382,31 +378,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
void *stack = task_stack_page(tsk);
struct vm_struct *vm = task_stack_vm_area(tsk);
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
- if (vm) {
- int i;
-
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_zone_page_state(page_zone(vm->pages[i]),
- NR_KERNEL_STACK_KB,
- PAGE_SIZE / 1024 * account);
- }
- } else {
- /*
- * All stack pages are in the same zone and belong to the
- * same memcg.
- */
- struct page *first_page = virt_to_page(stack);
-
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
-
- mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
- }
+ /* All stack pages are in the same node. */
+ if (vm)
+ mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ else
+ mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -415,24 +394,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
struct vm_struct *vm = task_stack_vm_area(tsk);
int ret;
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
if (vm) {
int i;
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
/*
* If memcg_kmem_charge_page() fails, page->mem_cgroup
- * pointer is NULL, and both memcg_kmem_uncharge_page()
- * and mod_memcg_page_state() in free_thread_stack()
- * will ignore this page. So it's safe.
+ * pointer is NULL, and memcg_kmem_uncharge_page() in
+ * free_thread_stack() will ignore this page.
*/
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
0);
if (ret)
return ret;
-
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- PAGE_SIZE / 1024);
}
}
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1d9e2fdfd67a..b2807e7be772 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind);
* to "name.*%u". Code fills in cpu number.
*
* Description: This helper function creates and names a kernel thread
- * The thread will be woken and put into park mode.
*/
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
void *data, unsigned int cpu,
@@ -1241,13 +1240,16 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(tsk->mm);
task_lock(tsk);
+ /* Hold off tlb flush IPIs while switching mm's */
+ local_irq_disable();
active_mm = tsk->active_mm;
if (active_mm != mm) {
mmgrab(mm);
tsk->active_mm = mm;
}
tsk->mm = mm;
- switch_mm(active_mm, mm, tsk);
+ switch_mm_irqs_off(active_mm, mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
finish_arch_post_lock_switch();
@@ -1276,9 +1278,11 @@ void kthread_unuse_mm(struct mm_struct *mm)
task_lock(tsk);
sync_mm_rss(mm);
+ local_irq_disable();
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
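kthread_use_mm()/kthread_unuse_mm() now disable interrupts across the mm switch so a TLB-flush IPI cannot observe half-updated task state; callers are unchanged. For context, a sketch of the usual calling pattern from a kernel thread (my_worker and the way it receives a pinned mm are hypothetical):

#include <linux/kthread.h>
#include <linux/sched/mm.h>

static int my_worker(void *arg)
{
	struct mm_struct *mm = arg;	/* hypothetical: caller handed over a pinned mm */

	kthread_use_mm(mm);
	/*
	 * While the mm is adopted, copy_{to,from}_user() and friends operate
	 * on that address space.  The new irq-off window only covers the
	 * switch_mm_irqs_off() call inside the helpers, not this region.
	 */
	/* ... access user memory here ... */
	kthread_unuse_mm(mm);

	return 0;
}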
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cef154261fe2..d25749bce7cf 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ac7198ed3197..8ce77d9ac716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -59,6 +59,7 @@
#include <linux/sched/clock.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
+#include <linux/kasan.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
head->func = func;
head->next = NULL;
local_irq_save(flags);
+ kasan_record_aux_stack(head);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
diff --git a/kernel/scs.c b/kernel/scs.c
index 5d4d9bbdec36..4ff4a7ba0094 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -17,7 +17,7 @@ static void __scs_account(void *s, int account)
{
struct page *scs_page = virt_to_page(s);
- mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+ mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
account * (SCS_SIZE / SZ_1K));
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1b4d2dc270a5..f785de3caac0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = overcommit_policy_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
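Switching vm.overcommit_memory to a dedicated handler lets the kernel recompute the per-cpu batch for vm_committed_as and flush the pending per-cpu deltas before OVERCOMMIT_NEVER starts depending on a precise sum. A hedged sketch of what such a handler has to do; the real one lives in mm/util.c and parses into a temporary to avoid races, and the sync work function name is illustrative:

#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu_counter.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>

static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
			      size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!ret && write) {
		/* Shrink or grow the batching error bound for the new policy. */
		mm_compute_batch(sysctl_overcommit_memory);

		/*
		 * For OVERCOMMIT_NEVER, fold every CPU's pending delta into
		 * the global count so __vm_enough_memory() sees an accurate
		 * value right away.
		 */
		if (sysctl_overcommit_memory == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
	}
	return ret;
}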
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 34b84bcbd3d9..047b53dbfd58 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -18,7 +18,7 @@ config CC_HAS_KASAN_SW_TAGS
config CC_HAS_WORKING_NOSANITIZE_ADDRESS
def_bool !CC_IS_GCC || GCC_VERSION >= 80300
-config KASAN
+menuconfig KASAN
bool "KASAN: runtime memory debugger"
depends on (HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \
(HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)
@@ -29,9 +29,10 @@ config KASAN
designed to find out-of-bounds accesses and use-after-free bugs.
See Documentation/dev-tools/kasan.rst for details.
+if KASAN
+
choice
prompt "KASAN mode"
- depends on KASAN
default KASAN_GENERIC
help
KASAN has two modes: generic KASAN (similar to userspace ASan,
@@ -39,6 +40,7 @@ choice
software tag-based KASAN (a version based on software memory
tagging, arm64 only, similar to userspace HWASan, enabled with
CONFIG_KASAN_SW_TAGS).
+
Both generic and tag-based KASAN are strictly debugging features.
config KASAN_GENERIC
@@ -50,16 +52,18 @@ config KASAN_GENERIC
select STACKDEPOT
help
Enables generic KASAN mode.
- Supported in both GCC and Clang. With GCC it requires version 4.9.2
- or later for basic support and version 5.0 or later for detection of
- out-of-bounds accesses for stack and global variables and for inline
- instrumentation mode (CONFIG_KASAN_INLINE). With Clang it requires
- version 3.7.0 or later and it doesn't support detection of
- out-of-bounds accesses for global variables yet.
+
+ This mode is supported in both GCC and Clang. With GCC it requires
+ version 8.3.0 or later. With Clang it requires version 7.0.0 or
+ later, but detection of out-of-bounds accesses for global variables
+ is supported only since Clang 11.
+
This mode consumes about 1/8th of available memory at kernel start
and introduces an overhead of ~x1.5 for the rest of the allocations.
The performance slowdown is ~x3.
+
For better error detection enable CONFIG_STACKTRACE.
+
Currently CONFIG_KASAN_GENERIC doesn't work with CONFIG_DEBUG_SLAB
(the resulting kernel does not boot).
@@ -72,15 +76,19 @@ config KASAN_SW_TAGS
select STACKDEPOT
help
Enables software tag-based KASAN mode.
+
This mode requires Top Byte Ignore support by the CPU and therefore
- is only supported for arm64.
- This mode requires Clang version 7.0.0 or later.
+ is only supported for arm64. This mode requires Clang version 7.0.0
+ or later.
+
This mode consumes about 1/16th of available memory at kernel start
and introduces an overhead of ~20% for the rest of the allocations.
This mode may potentially introduce problems relating to pointer
casting and comparison, as it embeds tags into the top byte of each
pointer.
+
For better error detection enable CONFIG_STACKTRACE.
+
Currently CONFIG_KASAN_SW_TAGS doesn't work with CONFIG_DEBUG_SLAB
(the resulting kernel does not boot).
@@ -88,7 +96,6 @@ endchoice
choice
prompt "Instrumentation type"
- depends on KASAN
default KASAN_OUTLINE
config KASAN_OUTLINE
@@ -107,13 +114,11 @@ config KASAN_INLINE
memory accesses. This is faster than outline (in some workloads
it gives about x2 boost over outline instrumentation), but
makes the kernel's .text size much bigger.
- For CONFIG_KASAN_GENERIC this requires GCC 5.0 or later.
endchoice
config KASAN_STACK_ENABLE
bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST
- depends on KASAN
help
The LLVM stack address sanitizer has a known problem that
causes excessive stack usage in a lot of functions, see
@@ -134,7 +139,7 @@ config KASAN_STACK
config KASAN_S390_4_LEVEL_PAGING
bool "KASan: use 4-level paging"
- depends on KASAN && S390
+ depends on S390
help
Compiling the kernel with KASan disables automatic 3-level vs
4-level paging selection. 3-level paging is used by default (up
@@ -151,7 +156,7 @@ config KASAN_SW_TAGS_IDENTIFY
config KASAN_VMALLOC
bool "Back mappings in vmalloc space with real shadow memory"
- depends on KASAN && HAVE_ARCH_KASAN_VMALLOC
+ depends on HAVE_ARCH_KASAN_VMALLOC
help
By default, the shadow region for vmalloc space is the read-only
zero page. This means that KASAN cannot detect errors involving
@@ -164,8 +169,10 @@ config KASAN_VMALLOC
config TEST_KASAN
tristate "Module for testing KASAN for bug detection"
- depends on m && KASAN
+ depends on m
help
This is a test module doing various nasty things like
out of bounds accesses, use after free. It is useful for testing
kernel debugging features like KASAN.
+
+endif # KASAN
diff --git a/lib/Makefile b/lib/Makefile
index 435f7f13b8aa..f39962104036 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -37,7 +37,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o
lib-$(CONFIG_PRINTK) += dump_stack.o
-lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
lib-y += kobject.o klist.o
diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c
index 20ed0f766787..4cd2b335cb7f 100644
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -69,7 +69,7 @@ void mpi_free_limb_space(mpi_ptr_t a)
if (!a)
return;
- kzfree(a);
+ kfree_sensitive(a);
}
void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs)
@@ -95,7 +95,7 @@ int mpi_resize(MPI a, unsigned nlimbs)
if (!p)
return -ENOMEM;
memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t));
- kzfree(a->d);
+ kfree_sensitive(a->d);
a->d = p;
} else {
a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL);
@@ -112,7 +112,7 @@ void mpi_free(MPI a)
return;
if (a->flags & 4)
- kzfree(a->d);
+ kfree_sensitive(a->d);
else
mpi_free_limb_space(a->d);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a66595ba5543..a2345de90e93 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -99,6 +99,25 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
EXPORT_SYMBOL(percpu_counter_add_batch);
/*
+ * For a percpu_counter with a big batch, the deviation of its count can
+ * be large. There are cases where that deviation needs to be reduced, for
+ * example when the counter's batch is decreased at runtime to get better
+ * accuracy, which can be achieved by running this sync function on each CPU.
+ */
+void percpu_counter_sync(struct percpu_counter *fbc)
+{
+ unsigned long flags;
+ s64 count;
+
+ raw_spin_lock_irqsave(&fbc->lock, flags);
+ count = __this_cpu_read(*fbc->counters);
+ fbc->count += count;
+ __this_cpu_sub(*fbc->counters, count);
+ raw_spin_unlock_irqrestore(&fbc->lock, flags);
+}
+EXPORT_SYMBOL(percpu_counter_sync);
+
+/*
* Add up all the per-cpu counts, return the result. This is a more accurate
* but much slower version of percpu_counter_read_positive()
*/
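percpu_counter_sync() above only folds the calling CPU's local delta into fbc->count, so reducing the deviation for the whole counter means invoking it on every CPU. A minimal sketch of that pattern (the wrapper names are made up; this mirrors how a runtime batch shrink would be followed by a sync):

#include <linux/percpu_counter.h>
#include <linux/smp.h>

/* Hypothetical IPI callback: fold this CPU's delta into fbc->count. */
static void pc_sync_one(void *info)
{
	percpu_counter_sync(info);
}

/* Hypothetical helper: run the sync on every online CPU and wait. */
static void pc_sync_all(struct percpu_counter *fbc)
{
	on_each_cpu(pc_sync_one, fbc, 1);
}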
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index dc2c6a51d11a..53e953bb1d1d 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -23,6 +23,10 @@
#include <asm/page.h>
+#include "../mm/kasan/kasan.h"
+
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE)
+
/*
* We assign some test results to these globals to make sure the tests
* are not eliminated as dead code.
@@ -48,7 +52,8 @@ static noinline void __init kmalloc_oob_right(void)
return;
}
- ptr[size] = 'x';
+ ptr[size + OOB_TAG_OFF] = 'x';
+
kfree(ptr);
}
@@ -100,7 +105,8 @@ static noinline void __init kmalloc_pagealloc_oob_right(void)
return;
}
- ptr[size] = 0;
+ ptr[size + OOB_TAG_OFF] = 0;
+
kfree(ptr);
}
@@ -170,7 +176,8 @@ static noinline void __init kmalloc_oob_krealloc_more(void)
return;
}
- ptr2[size2] = 'x';
+ ptr2[size2 + OOB_TAG_OFF] = 'x';
+
kfree(ptr2);
}
@@ -188,7 +195,9 @@ static noinline void __init kmalloc_oob_krealloc_less(void)
kfree(ptr1);
return;
}
- ptr2[size2] = 'x';
+
+ ptr2[size2 + OOB_TAG_OFF] = 'x';
+
kfree(ptr2);
}
@@ -224,7 +233,8 @@ static noinline void __init kmalloc_oob_memset_2(void)
return;
}
- memset(ptr+7, 0, 2);
+ memset(ptr + 7 + OOB_TAG_OFF, 0, 2);
+
kfree(ptr);
}
@@ -240,7 +250,8 @@ static noinline void __init kmalloc_oob_memset_4(void)
return;
}
- memset(ptr+5, 0, 4);
+ memset(ptr + 5 + OOB_TAG_OFF, 0, 4);
+
kfree(ptr);
}
@@ -257,7 +268,8 @@ static noinline void __init kmalloc_oob_memset_8(void)
return;
}
- memset(ptr+1, 0, 8);
+ memset(ptr + 1 + OOB_TAG_OFF, 0, 8);
+
kfree(ptr);
}
@@ -273,7 +285,8 @@ static noinline void __init kmalloc_oob_memset_16(void)
return;
}
- memset(ptr+1, 0, 16);
+ memset(ptr + 1 + OOB_TAG_OFF, 0, 16);
+
kfree(ptr);
}
@@ -289,7 +302,8 @@ static noinline void __init kmalloc_oob_in_memset(void)
return;
}
- memset(ptr, 0, size+5);
+ memset(ptr, 0, size + 5 + OOB_TAG_OFF);
+
kfree(ptr);
}
@@ -423,7 +437,8 @@ static noinline void __init kmem_cache_oob(void)
return;
}
- *p = p[size];
+ *p = p[size + OOB_TAG_OFF];
+
kmem_cache_free(cache, p);
kmem_cache_destroy(cache);
}
@@ -473,7 +488,7 @@ static noinline void __init kasan_global_oob(void)
static noinline void __init kasan_stack_oob(void)
{
char stack_array[10];
- volatile int i = 0;
+ volatile int i = OOB_TAG_OFF;
char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
pr_info("out-of-bounds on stack\n");
@@ -520,25 +535,25 @@ static noinline void __init copy_user_test(void)
}
pr_info("out-of-bounds in copy_from_user()\n");
- unused = copy_from_user(kmem, usermem, size + 1);
+ unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
pr_info("out-of-bounds in copy_to_user()\n");
- unused = copy_to_user(usermem, kmem, size + 1);
+ unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
pr_info("out-of-bounds in __copy_from_user()\n");
- unused = __copy_from_user(kmem, usermem, size + 1);
+ unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
pr_info("out-of-bounds in __copy_to_user()\n");
- unused = __copy_to_user(usermem, kmem, size + 1);
+ unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
- unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF);
pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
- unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF);
pr_info("out-of-bounds in strncpy_from_user()\n");
- unused = strncpy_from_user(kmem, usermem, size + 1);
+ unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
vm_munmap((unsigned long)usermem, PAGE_SIZE);
kfree(kmem);
@@ -766,15 +781,15 @@ static noinline void __init kmalloc_double_kzfree(void)
char *ptr;
size_t size = 16;
- pr_info("double-free (kzfree)\n");
+ pr_info("double-free (kfree_sensitive)\n");
ptr = kmalloc(size, GFP_KERNEL);
if (!ptr) {
pr_err("Allocation failed\n");
return;
}
- kzfree(ptr);
- kzfree(ptr);
+ kfree_sensitive(ptr);
+ kfree_sensitive(ptr);
}
#ifdef CONFIG_KASAN_VMALLOC
@@ -801,6 +816,35 @@ static noinline void __init vmalloc_oob(void)
static void __init vmalloc_oob(void) {}
#endif
+static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+} *global_rcu_ptr;
+
+static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
+{
+ struct kasan_rcu_info *fp = container_of(rp,
+ struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ fp->i = 1;
+}
+
+static noinline void __init kasan_rcu_uaf(void)
+{
+ struct kasan_rcu_info *ptr;
+
+ pr_info("use-after-free in kasan_rcu_reclaim\n");
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ global_rcu_ptr = rcu_dereference_protected(ptr, NULL);
+ call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim);
+}
+
static int __init kmalloc_tests_init(void)
{
/*
@@ -848,6 +892,7 @@ static int __init kmalloc_tests_init(void)
kasan_bitops();
kmalloc_double_kzfree();
vmalloc_oob();
+ kasan_rcu_uaf();
kasan_restore_multi_shot(multishot);
diff --git a/mm/Kconfig b/mm/Kconfig
index d41f3fa7e923..6c974888f86f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -88,13 +88,9 @@ config NEED_MULTIPLE_NODES
def_bool y
depends on DISCONTIGMEM || NUMA
-config HAVE_MEMORY_PRESENT
- def_bool y
- depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
-
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
-# allocations when memory_present() is called. If this cannot
+# allocations when sparse_init() is called. If this cannot
# be done on your architecture, select this option. However,
# statically allocating the mem_section[] array can potentially
# consume vast quantities of .bss, so be careful.
diff --git a/mm/Makefile b/mm/Makefile
index 6e9d46b2efc9..d5649f1c12c0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -38,7 +38,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o
+ pgtable-generic.o rmap.o vmalloc.o ioremap.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/debug.c b/mm/debug.c
index 4f376514744d..ca8d1cacdecc 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -69,8 +69,19 @@ void __dump_page(struct page *page, const char *reason)
}
if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
- /* Corrupt page, cannot call page_mapping */
- mapping = page->mapping;
+ /*
+ * Corrupt page, so we cannot call page_mapping. Instead, do a
+ * safe subset of the steps that page_mapping() does. Caution:
+ * this will be misleading for tail pages, PageSwapCache pages,
+ * and potentially other situations. (See the page_mapping()
+ * implementation for what's missing here.)
+ */
+ unsigned long tmp = (unsigned long)page->mapping;
+
+ if (tmp & PAGE_MAPPING_ANON)
+ mapping = NULL;
+ else
+ mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
head = page;
compound = false;
} else {
@@ -84,86 +95,76 @@ void __dump_page(struct page *page, const char *reason)
*/
mapcount = PageSlab(head) ? 0 : page_mapcount(page);
- if (compound)
+ pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
+ page, page_ref_count(head), mapcount, mapping,
+ page_to_pgoff(page), page_to_pfn(page));
+ if (compound) {
if (hpage_pincount_available(page)) {
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
- "index:%#lx head:%px order:%u "
- "compound_mapcount:%d compound_pincount:%d\n",
- page, page_ref_count(head), mapcount,
- mapping, page_to_pgoff(page), head,
- compound_order(head), compound_mapcount(page),
- compound_pincount(page));
+ pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
+ head, compound_order(head),
+ head_mapcount(head),
+ head_pincount(head));
} else {
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
- "index:%#lx head:%px order:%u "
- "compound_mapcount:%d\n",
- page, page_ref_count(head), mapcount,
- mapping, page_to_pgoff(page), head,
- compound_order(head), compound_mapcount(page));
+ pr_warn("head:%p order:%u compound_mapcount:%d\n",
+ head, compound_order(head),
+ head_mapcount(head));
}
- else
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_ref_count(page), mapcount,
- mapping, page_to_pgoff(page));
+ }
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
type = "anon ";
else if (mapping) {
- const struct inode *host;
+ struct inode *host;
const struct address_space_operations *a_ops;
- const struct hlist_node *dentry_first;
- const struct dentry *dentry_ptr;
+ struct hlist_node *dentry_first;
+ struct dentry *dentry_ptr;
struct dentry dentry;
/*
* mapping can be invalid pointer and we don't want to crash
* accessing it, so probe everything depending on it carefully
*/
- if (copy_from_kernel_nofault(&host, &mapping->host,
- sizeof(struct inode *)) ||
- copy_from_kernel_nofault(&a_ops, &mapping->a_ops,
- sizeof(struct address_space_operations *))) {
- pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n");
+ if (get_kernel_nofault(host, &mapping->host) ||
+ get_kernel_nofault(a_ops, &mapping->a_ops)) {
+ pr_warn("failed to read mapping contents, not a valid kernel address?\n");
goto out_mapping;
}
if (!host) {
- pr_warn("mapping->a_ops:%ps\n", a_ops);
+ pr_warn("aops:%ps\n", a_ops);
goto out_mapping;
}
- if (copy_from_kernel_nofault(&dentry_first,
- &host->i_dentry.first, sizeof(struct hlist_node *))) {
- pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n",
- a_ops, host);
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) {
+ pr_warn("aops:%ps with invalid host inode %px\n",
+ a_ops, host);
goto out_mapping;
}
if (!dentry_first) {
- pr_warn("mapping->a_ops:%ps\n", a_ops);
+ pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino);
goto out_mapping;
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
- if (copy_from_kernel_nofault(&dentry, dentry_ptr,
- sizeof(struct dentry))) {
- pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n",
- a_ops, dentry_ptr);
+ if (get_kernel_nofault(dentry, dentry_ptr)) {
+ pr_warn("aops:%ps with invalid dentry %px\n", a_ops,
+ dentry_ptr);
} else {
/*
* if dentry is corrupted, the %pd handler may still
* crash, but it's unlikely that we reach here with a
* corrupted struct page
*/
- pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n",
- a_ops, &dentry);
+ pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
+ a_ops, host->i_ino, &dentry);
}
}
out_mapping:
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
+ pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
page_cma ? " CMA" : "");
hex_only:
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index d315ff544f05..086309fb9b6f 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -8,7 +8,7 @@
*
* Author: Anshuman Khandual <anshuman.khandual@arm.com>
*/
-#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__
+#define pr_fmt(fmt) "debug_vm_pgtable: [%-25s]: " fmt, __func__
#include <linux/gfp.h>
#include <linux/highmem.h>
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/pfn_t.h>
#include <linux/printk.h>
+#include <linux/pgtable.h>
#include <linux/random.h>
#include <linux/spinlock.h>
#include <linux/swap.h>
@@ -28,6 +29,13 @@
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Please refer to Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * expected of the helpers that are being validated here. All future changes
+ * here or in the documentation need to be kept in sync.
+ */
#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
@@ -46,6 +54,7 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
{
pte_t pte = pfn_pte(pfn, prot);
+ pr_debug("Validating PTE basic\n");
WARN_ON(!pte_same(pte, pte));
WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
@@ -55,6 +64,57 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
}
+static void __init pte_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pte_t *ptep,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE advanced\n");
+ pte = pfn_pte(pfn, prot);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_set_wrprotect(mm, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(pte_write(pte));
+
+ pte = pfn_pte(pfn, prot);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_get_and_clear(mm, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(!pte_none(pte));
+
+ pte = pfn_pte(pfn, prot);
+ pte = pte_wrprotect(pte);
+ pte = pte_mkclean(pte);
+ set_pte_at(mm, vaddr, ptep, pte);
+ pte = pte_mkwrite(pte);
+ pte = pte_mkdirty(pte);
+ ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+ pte = ptep_get(ptep);
+ WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
+
+ pte = pfn_pte(pfn, prot);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_get_and_clear_full(mm, vaddr, ptep, 1);
+ pte = ptep_get(ptep);
+ WARN_ON(!pte_none(pte));
+
+ pte = pte_mkyoung(pte);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_test_and_clear_young(vma, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(pte_young(pte));
+}
+
+static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE saved write\n");
+ WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
+ WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
+}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -63,6 +123,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
if (!has_transparent_hugepage())
return;
+ pr_debug("Validating PMD basic\n");
WARN_ON(!pmd_same(pmd, pmd));
WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
@@ -77,6 +138,95 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
}
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pmd_t *pmdp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD advanced\n");
+ /* Align the address wrt HPAGE_PMD_SIZE */
+ vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+
+ pmd = pfn_pmd(pfn, prot);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_set_wrprotect(mm, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_write(pmd));
+
+ pmd = pfn_pmd(pfn, prot);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+
+ pmd = pfn_pmd(pfn, prot);
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_mkclean(pmd);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkdirty(pmd);
+ pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
+
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+
+ pmd = pmd_mkyoung(pmd);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_test_and_clear_young(vma, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_young(pmd));
+}
+
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ pr_debug("Validating PMD leaf\n");
+ /*
+ * PMD based THP is a leaf entry.
+ */
+ pmd = pmd_mkhuge(pmd);
+ WARN_ON(!pmd_leaf(pmd));
+}
+
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ return;
+
+ pr_debug("Validating PMD huge\n");
+ /*
+ * The x86 implementation of pmd_set_huge() verifies that the given
+ * PMD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pmd_clear_huge(pmdp));
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+}
+
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ pr_debug("Validating PMD saved write\n");
+ WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
+ WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
+}
+
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -85,6 +235,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
if (!has_transparent_hugepage())
return;
+ pr_debug("Validating PUD basic\n");
WARN_ON(!pud_same(pud, pud));
WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
@@ -100,18 +251,130 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
*/
WARN_ON(!pud_bad(pud_mkhuge(pud)));
}
+
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD advanced\n");
+ /* Align the address wrt HPAGE_PUD_SIZE */
+ vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_set_wrprotect(mm, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_write(pud));
+
+#ifndef __PAGETABLE_PMD_FOLDED
+ pud = pfn_pud(pfn, prot);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+
+ pud = pfn_pud(pfn, prot);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+ pud = pfn_pud(pfn, prot);
+ pud = pud_wrprotect(pud);
+ pud = pud_mkclean(pud);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pud = pud_mkwrite(pud);
+ pud = pud_mkdirty(pud);
+ pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+
+ pud = pud_mkyoung(pud);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_test_and_clear_young(vma, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_young(pud));
+}
+
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ pr_debug("Validating PUD leaf\n");
+ /*
+ * PUD based THP is a leaf entry.
+ */
+ pud = pud_mkhuge(pud);
+ WARN_ON(!pud_leaf(pud));
+}
+
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ return;
+
+ pr_debug("Validating PUD huge\n");
+ /*
+ * The x86 implementation of pud_set_huge() verifies that the given
+ * PUD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pudp, __pud(0));
+ WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pud_clear_huge(pudp));
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pmd_t *pmdp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
{
p4d_t p4d;
+ pr_debug("Validating P4D basic\n");
memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t));
WARN_ON(!p4d_same(p4d, p4d));
}
@@ -120,6 +383,7 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
{
pgd_t pgd;
+ pr_debug("Validating PGD basic\n");
memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t));
WARN_ON(!pgd_same(pgd, pgd));
}
@@ -132,6 +396,7 @@ static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
if (mm_pmd_folded(mm))
return;
+ pr_debug("Validating PUD clear\n");
pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
WRITE_ONCE(*pudp, pud);
pud_clear(pudp);
@@ -146,6 +411,8 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
if (mm_pmd_folded(mm))
return;
+
+ pr_debug("Validating PUD populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
@@ -172,6 +439,7 @@ static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
if (mm_pud_folded(mm))
return;
+ pr_debug("Validating P4D clear\n");
p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
WRITE_ONCE(*p4dp, p4d);
p4d_clear(p4dp);
@@ -187,6 +455,7 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
if (mm_pud_folded(mm))
return;
+ pr_debug("Validating P4D populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as p4d_bad().
@@ -205,6 +474,7 @@ static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
if (mm_p4d_folded(mm))
return;
+ pr_debug("Validating PGD clear\n");
pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
WRITE_ONCE(*pgdp, pgd);
pgd_clear(pgdp);
@@ -220,6 +490,7 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
if (mm_p4d_folded(mm))
return;
+ pr_debug("Validating PGD populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as pgd_bad().
@@ -248,6 +519,7 @@ static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
{
pte_t pte = ptep_get(ptep);
+ pr_debug("Validating PTE clear\n");
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
set_pte_at(mm, vaddr, ptep, pte);
barrier();
@@ -260,6 +532,7 @@ static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
{
pmd_t pmd = READ_ONCE(*pmdp);
+ pr_debug("Validating PMD clear\n");
pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
WRITE_ONCE(*pmdp, pmd);
pmd_clear(pmdp);
@@ -272,6 +545,7 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
{
pmd_t pmd;
+ pr_debug("Validating PMD populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
@@ -282,6 +556,344 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
WARN_ON(pmd_bad(pmd));
}
+static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
+ return;
+
+ pr_debug("Validating PTE special\n");
+ WARN_ON(!pte_special(pte_mkspecial(pte)));
+}
+
+static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ pr_debug("Validating PTE protnone\n");
+ WARN_ON(!pte_protnone(pte));
+ WARN_ON(!pte_present(pte));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ pr_debug("Validating PMD protnone\n");
+ WARN_ON(!pmd_protnone(pmd));
+ WARN_ON(!pmd_present(pmd));
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE devmap\n");
+ WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ pr_debug("Validating PMD devmap\n");
+ WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ pr_debug("Validating PUD devmap\n");
+ WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PTE soft dirty\n");
+ WARN_ON(!pte_soft_dirty(pte_mksoft_dirty(pte)));
+ WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
+}
+
+static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PTE swap soft dirty\n");
+ WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
+ WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PMD soft dirty\n");
+ WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
+ WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
+}
+
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
+ !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
+ return;
+
+ pr_debug("Validating PMD swap soft dirty\n");
+ WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
+ WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
+}
+#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+ swp_entry_t swp;
+ pte_t pte;
+
+ pr_debug("Validating PTE swap\n");
+ pte = pfn_pte(pfn, prot);
+ swp = __pte_to_swp_entry(pte);
+ pte = __swp_entry_to_pte(swp);
+ WARN_ON(pfn != pte_pfn(pte));
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+ swp_entry_t swp;
+ pmd_t pmd;
+
+ pr_debug("Validating PMD swap\n");
+ pmd = pfn_pmd(pfn, prot);
+ swp = __pmd_to_swp_entry(pmd);
+ pmd = __swp_entry_to_pmd(swp);
+ WARN_ON(pfn != pmd_pfn(pmd));
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static void __init swap_migration_tests(void)
+{
+ struct page *page;
+ swp_entry_t swp;
+
+ if (!IS_ENABLED(CONFIG_MIGRATION))
+ return;
+
+ pr_debug("Validating swap migration\n");
+ /*
+ * swap_migration_tests() requires a dedicated page as it needs to
+ * be locked before creating a migration entry from it. Locking the
+ * page that actually maps kernel text ('start_kernel') can be real
+ * problematic. Lets allocate a dedicated page explicitly for this
+ * purpose that will be freed subsequently.
+ */
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ pr_err("page allocation failed\n");
+ return;
+ }
+
+ /*
+ * make_migration_entry() expects the given page to be
+ * locked, otherwise it hits a BUG_ON().
+ */
+ __SetPageLocked(page);
+ swp = make_migration_entry(page, 1);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(!is_write_migration_entry(swp));
+
+ make_migration_entry_read(&swp);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(is_write_migration_entry(swp));
+
+ swp = make_migration_entry(page, 0);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(is_write_migration_entry(swp));
+ __ClearPageLocked(page);
+ __free_page(page);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ struct page *page;
+ pte_t pte;
+
+ pr_debug("Validating HugeTLB basic\n");
+ /*
+ * Accessing the page associated with the pfn is safe here,
+ * as it was previously derived from a real kernel symbol.
+ */
+ page = pfn_to_page(pfn);
+ pte = mk_huge_pte(page, prot);
+
+ WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
+ WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
+ WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ pte = pfn_pte(pfn, prot);
+
+ WARN_ON(!pte_huge(pte_mkhuge(pte)));
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+}
+
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pte_t *ptep, unsigned long pfn,
+ unsigned long vaddr, pgprot_t prot)
+{
+ struct page *page = pfn_to_page(pfn);
+ pte_t pte = ptep_get(ptep);
+ unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
+
+ pr_debug("Validating HugeTLB advanced\n");
+ pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
+ huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(!huge_pte_none(pte));
+
+ pte = mk_huge_pte(page, prot);
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ huge_ptep_set_wrprotect(mm, vaddr, ptep);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(huge_pte_write(pte));
+
+ pte = mk_huge_pte(page, prot);
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ huge_ptep_get_and_clear(mm, vaddr, ptep);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(!huge_pte_none(pte));
+
+ pte = mk_huge_pte(page, prot);
+ pte = huge_pte_wrprotect(pte);
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ pte = huge_pte_mkwrite(pte);
+ pte = huge_pte_mkdirty(pte);
+ huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pte_t *ptep, unsigned long pfn,
+ unsigned long vaddr, pgprot_t prot)
+{
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD based THP\n");
+ /*
+ * pmd_trans_huge() and pmd_present() must return positive after
+ * MMU invalidation with pmd_mkinvalid(). This behavior is an
+ * optimization for transparent huge page. pmd_trans_huge() must
+ * be true if pmd_page() returns a valid THP to avoid taking the
+ * pmd_lock when others walk over non transhuge pmds (i.e. there
+ * are no THP allocated). Especially when splitting a THP and
+ * removing the present bit from the pmd, pmd_trans_huge() still
+ * needs to return true. pmd_present() should be true whenever
+ * pmd_trans_huge() returns true.
+ */
+ pmd = pfn_pmd(pfn, prot);
+ WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+ WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd))));
+ WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd))));
+#endif /* __HAVE_ARCH_PMDP_INVALIDATE */
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD based THP\n");
+ pud = pfn_pud(pfn, prot);
+ WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
+
+ /*
+ * pud_mkinvalid() has been dropped for now. Re-enable
+ * these tests when it comes back with a modified pud_present().
+ *
+ * WARN_ON(!pud_trans_huge(pud_mkinvalid(pud_mkhuge(pud))));
+ * WARN_ON(!pud_present(pud_mkinvalid(pud_mkhuge(pud))));
+ */
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static unsigned long __init get_random_vaddr(void)
{
unsigned long random_vaddr, random_pages, total_user_pages;
@@ -296,6 +908,7 @@ static unsigned long __init get_random_vaddr(void)
static int __init debug_vm_pgtable(void)
{
+ struct vm_area_struct *vma;
struct mm_struct *mm;
pgd_t *pgdp;
p4d_t *p4dp, *saved_p4dp;
@@ -303,7 +916,7 @@ static int __init debug_vm_pgtable(void)
pmd_t *pmdp, *saved_pmdp, pmd;
pte_t *ptep;
pgtable_t saved_ptep;
- pgprot_t prot;
+ pgprot_t prot, protnone;
phys_addr_t paddr;
unsigned long vaddr, pte_aligned, pmd_aligned;
unsigned long pud_aligned, p4d_aligned, pgd_aligned;
@@ -319,6 +932,18 @@ static int __init debug_vm_pgtable(void)
}
/*
+ * __P000 (or even __S000) will help create page table entries with
+ * PROT_NONE permission as required for pxx_protnone_tests().
+ */
+ protnone = __P000;
+
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ pr_err("vma allocation failed\n");
+ return 1;
+ }
+
+ /*
* PFN for mapping at PTE level is determined from a standard kernel
* text symbol. But pfns for higher page table levels are derived by
* masking lower bits of this real pfn. These derived pfns might not
@@ -366,6 +991,20 @@ static int __init debug_vm_pgtable(void)
p4d_clear_tests(mm, p4dp);
pgd_clear_tests(mm, pgdp);
+ pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+ pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+ pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+ hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+ pmd_leaf_tests(pmd_aligned, prot);
+ pud_leaf_tests(pud_aligned, prot);
+
+ pmd_huge_tests(pmdp, pmd_aligned, prot);
+ pud_huge_tests(pudp, pud_aligned, prot);
+
+ pte_savedwrite_tests(pte_aligned, prot);
+ pmd_savedwrite_tests(pmd_aligned, prot);
+
pte_unmap_unlock(ptep, ptl);
pmd_populate_tests(mm, pmdp, saved_ptep);
@@ -373,11 +1012,34 @@ static int __init debug_vm_pgtable(void)
p4d_populate_tests(mm, p4dp, saved_pudp);
pgd_populate_tests(mm, pgdp, saved_p4dp);
+ pte_special_tests(pte_aligned, prot);
+ pte_protnone_tests(pte_aligned, protnone);
+ pmd_protnone_tests(pmd_aligned, protnone);
+
+ pte_devmap_tests(pte_aligned, prot);
+ pmd_devmap_tests(pmd_aligned, prot);
+ pud_devmap_tests(pud_aligned, prot);
+
+ pte_soft_dirty_tests(pte_aligned, prot);
+ pmd_soft_dirty_tests(pmd_aligned, prot);
+ pte_swap_soft_dirty_tests(pte_aligned, prot);
+ pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+
+ pte_swap_tests(pte_aligned, prot);
+ pmd_swap_tests(pmd_aligned, prot);
+
+ swap_migration_tests();
+ hugetlb_basic_tests(pte_aligned, prot);
+
+ pmd_thp_tests(pmd_aligned, prot);
+ pud_thp_tests(pud_aligned, prot);
+
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
pmd_free(mm, saved_pmdp);
pte_free(mm, saved_ptep);
+ vm_area_free(vma);
mm_dec_nr_puds(mm);
mm_dec_nr_pmds(mm);
mm_dec_nr_ptes(mm);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f131f1cfde3..f2bb5ff0293d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -41,6 +41,7 @@
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
+#include <linux/page_idle.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -1648,6 +1649,9 @@ EXPORT_SYMBOL(find_lock_entry);
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
* page is already in cache. If the page was allocated, unlock it before
* returning so the caller can do the same dance.
+ * * %FGP_WRITE - The page will be written to
+ * * %FGP_NOFS - __GFP_FS will be cleared from the gfp mask
+ * * %FGP_NOWAIT - Don't block on the page lock
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
@@ -1689,6 +1693,11 @@ repeat:
if (fgp_flags & FGP_ACCESSED)
mark_page_accessed(page);
+ else if (fgp_flags & FGP_WRITE) {
+ /* Clear idle flag for buffer write */
+ if (page_is_idle(page))
+ clear_page_idle(page);
+ }
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
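For context, a hedged sketch of a caller passing FGP_WRITE so that the hunk above clears the idle flag instead of marking the page accessed (the helper name is made up; pagecache_get_page() and the FGP flags are the existing API):

#include <linux/pagemap.h>

/* Hypothetical helper: find or create a locked page cache page for a write. */
static struct page *grab_page_for_write(struct address_space *mapping,
					pgoff_t index)
{
	return pagecache_get_page(mapping, index,
				  FGP_LOCK | FGP_WRITE | FGP_CREAT,
				  mapping_gfp_mask(mapping));
}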
diff --git a/mm/gup.c b/mm/gup.c
index 6f47697f8fb0..d8a33dd1430d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1404,7 +1404,8 @@ retry:
*
* This takes care of mlocking the pages too if VM_LOCKED is set.
*
- * return 0 on success, negative error code on error.
+ * Return either the number of pages pinned in the vma, or a negative error
+ * code on error.
*
* vma->vm_mm->mmap_lock must be held.
*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 78c84bee7e29..206f52b36ffb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1722,19 +1722,13 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
}
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
bool force_flush = false;
- if ((old_addr & ~HPAGE_PMD_MASK) ||
- (new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE)
- return false;
-
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have release it.
@@ -2069,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* free), userland could trigger a small page size TLB miss on the
* small sized TLB while the hugepage TLB entry is still established in
* the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
+ * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that it's
* only safe if the permission and cache attributes of the two entries
* loaded in the two TLB is identical (which should be the case here).
* But it is generally safer to never allow small and huge TLB entries
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 590111ea6975..e52c878940bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,6 +31,7 @@
#include <linux/cma.h>
#include <asm/page.h>
+#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <linux/io.h>
@@ -5313,25 +5314,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr;
+ unsigned long a_start, a_end;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ a_start = ALIGN_DOWN(*start, PUD_SIZE);
+ a_end = ALIGN(*end, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ /*
+ * Intersect the range with the vma range, since pmd sharing can't
+ * cross vma boundaries anyway.
+ */
+ *start = max(vma->vm_start, a_start);
+ *end = min(vma->vm_end, a_end);
}
/*
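Worked example of the new alignment logic in adjust_range_if_pmd_sharing_possible() (an illustrative userspace check only; the addresses are made up and the macros are stand-ins for the kernel's ALIGN helpers, with PUD_SIZE taken as 1 GiB):

#include <stdio.h>

#define PUD_SIZE		(1UL << 30)		/* stand-in: 1 GiB */
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* made-up vma [0x40000000, 0x80000000) and range [0x40200000, 0x40400000) */
	unsigned long vm_start = 0x40000000UL, vm_end = 0x80000000UL;
	unsigned long start = 0x40200000UL, end = 0x40400000UL;

	unsigned long a_start = ALIGN_DOWN(start, PUD_SIZE);	/* 0x40000000 */
	unsigned long a_end = ALIGN_UP(end, PUD_SIZE);		/* 0x80000000 */

	start = vm_start > a_start ? vm_start : a_start;	/* max() */
	end = vm_end < a_end ? vm_end : a_end;			/* min() */

	/* the adjusted range covers the whole vma here: [0x40000000, 0x80000000) */
	printf("adjusted range: [%#lx, %#lx)\n", start, end);
	return 0;
}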
diff --git a/lib/ioremap.c b/mm/ioremap.c
index 5ee3526f71b8..5fa1ab41d152 100644
--- a/lib/ioremap.c
+++ b/mm/ioremap.c
@@ -13,6 +13,8 @@
#include <linux/export.h>
#include <asm/cacheflush.h>
+#include "pgalloc-track.h"
+
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static int __read_mostly ioremap_p4d_capable;
static int __read_mostly ioremap_pud_capable;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 757d4074fe28..950fd372a07e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -40,7 +40,7 @@
#include "kasan.h"
#include "../slab.h"
-static inline depot_stack_handle_t save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
@@ -50,10 +50,10 @@ static inline depot_stack_handle_t save_stack(gfp_t flags)
return stack_depot_save(entries, nr_entries, flags);
}
-static inline void set_track(struct kasan_track *track, gfp_t flags)
+void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
track->pid = current->pid;
- track->stack = save_stack(flags);
+ track->stack = kasan_save_stack(flags);
}
void kasan_enable_current(void)
@@ -180,21 +180,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
kasan_unpoison_shadow(base, watermark - base);
}
-/*
- * Clear all poison for the region between the current SP and a provided
- * watermark value, as is sometimes required prior to hand-crafted asm function
- * returns in the middle of functions.
- */
-void kasan_unpoison_stack_above_sp_to(const void *watermark)
-{
- const void *sp = __builtin_frame_address(0);
- size_t size = watermark - sp;
-
- if (WARN_ON(sp > watermark))
- return;
- kasan_unpoison_shadow(sp, size);
-}
-
void kasan_alloc_pages(struct page *page, unsigned int order)
{
u8 tag;
@@ -298,24 +283,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
return (void *)object + cache->kasan_info.free_meta_offset;
}
-
-static void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- u8 idx = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- idx = alloc_meta->free_track_idx;
- alloc_meta->free_pointer_tag[idx] = tag;
- alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
-
- set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
-}
-
void kasan_poison_slab(struct page *page)
{
unsigned long i;
@@ -491,7 +458,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
KASAN_KMALLOC_REDZONE);
if (cache->flags & SLAB_KASAN)
- set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+ kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
return set_tag(object, tag);
}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 098a7dbaced6..248264b9cb76 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -324,3 +324,46 @@ DEFINE_ASAN_SET_SHADOW(f2);
DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
+
+void kasan_record_aux_stack(void *addr)
+{
+ struct page *page = kasan_addr_to_page(addr);
+ struct kmem_cache *cache;
+ struct kasan_alloc_meta *alloc_info;
+ void *object;
+
+ if (!(page && PageSlab(page)))
+ return;
+
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, addr);
+ alloc_info = get_alloc_info(cache, object);
+
+ /*
+ * record the last two call_rcu() call stacks.
+ */
+ alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
+ alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+}
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_free_meta *free_meta;
+
+ free_meta = get_free_info(cache, object);
+ kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+
+ /*
+ * the object was freed and has free track set
+ */
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
+ return NULL;
+ return &get_free_info(cache, object)->free_track;
+}
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index e200acb2d292..a38c7a9e192a 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -80,6 +80,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
break;
case KASAN_FREE_PAGE:
case KASAN_KMALLOC_FREE:
+ case KASAN_KMALLOC_FREETRACK:
bug_type = "use-after-free";
break;
case KASAN_ALLOCA_LEFT:
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index cfade6413528..ac499456740f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -17,15 +17,17 @@
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
#else
#define KASAN_FREE_PAGE KASAN_TAG_INVALID
#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
#endif
-#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID 0xF9 /* unallocated space in vmapped page */
+#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
+#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
/*
* Stack redzone shadow values
@@ -104,7 +106,15 @@ struct kasan_track {
struct kasan_alloc_meta {
struct kasan_track alloc_track;
+#ifdef CONFIG_KASAN_GENERIC
+ /*
+ * call_rcu() call stack is stored into struct kasan_alloc_meta.
+ * The free stack is stored into struct kasan_free_meta.
+ */
+ depot_stack_handle_t aux_stack[2];
+#else
struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#endif
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
u8 free_track_idx;
@@ -119,6 +129,9 @@ struct kasan_free_meta {
* Otherwise it might be used for the allocator freelist.
*/
struct qlist_node quarantine_link;
+#ifdef CONFIG_KASAN_GENERIC
+ struct kasan_track free_track;
+#endif
};
struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
@@ -159,6 +172,12 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
struct page *kasan_addr_to_page(const void *addr);
+depot_stack_handle_t kasan_save_stack(gfp_t flags);
+void kasan_set_track(struct kasan_track *track, gfp_t flags);
+void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag);
+
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 978bc4a3eb51..4c5375810449 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -145,6 +145,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
___cache_free(cache, object, _THIS_IP_);
if (IS_ENABLED(CONFIG_SLAB))
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 51ec45407a0b..4f49fa6cd1aa 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -106,15 +106,20 @@ static void end_report(unsigned long *flags)
kasan_enable_current();
}
+static void print_stack(depot_stack_handle_t stack)
+{
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ nr_entries = stack_depot_fetch(stack, &entries);
+ stack_trace_print(entries, nr_entries, 0);
+}
+
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
- unsigned long *entries;
- unsigned int nr_entries;
-
- nr_entries = stack_depot_fetch(track->stack, &entries);
- stack_trace_print(entries, nr_entries, 0);
+ print_stack(track->stack);
} else {
pr_err("(stack is not available)\n");
}
@@ -160,26 +165,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
-static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- int i = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- break;
- }
- if (i == KASAN_NR_FREE_STACKS)
- i = alloc_meta->free_track_idx;
-#endif
-
- return &alloc_meta->free_track[i];
-}
-
static void describe_object(struct kmem_cache *cache, void *object,
const void *addr, u8 tag)
{
@@ -191,8 +176,23 @@ static void describe_object(struct kmem_cache *cache, void *object,
print_track(&alloc_info->alloc_track, "Allocated");
pr_err("\n");
free_track = kasan_get_free_track(cache, object, tag);
- print_track(free_track, "Freed");
- pr_err("\n");
+ if (free_track) {
+ print_track(free_track, "Freed");
+ pr_err("\n");
+ }
+
+#ifdef CONFIG_KASAN_GENERIC
+ if (alloc_info->aux_stack[0]) {
+ pr_err("Last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[0]);
+ pr_err("\n");
+ }
+ if (alloc_info->aux_stack[1]) {
+ pr_err("Second to last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[1]);
+ pr_err("\n");
+ }
+#endif
}
describe_object_addr(cache, object, addr);
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 8a959fdd30e3..e02a36a51f42 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -161,3 +161,40 @@ void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
kasan_poison_shadow((void *)addr, size, tag);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
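
For orientation, here is a standalone, simplified C sketch of the tag-matching ring buffer that kasan_set_free_info() and kasan_get_free_track() implement above for CONFIG_KASAN_SW_TAGS_IDENTIFY: frees are recorded round-robin keyed by the pointer tag, and a lookup that finds no matching tag falls back to the slot that would be overwritten next. NR_FREE_STACKS, struct track and stack_id are illustrative stand-ins, not the kernel's definitions.

/* Simplified userspace model of the KASAN free-track ring. */
#include <stdio.h>
#include <stdint.h>

#define NR_FREE_STACKS 5			/* stand-in for KASAN_NR_FREE_STACKS */

struct track { uint32_t stack_id; };		/* stand-in for struct kasan_track */

struct alloc_meta {
	struct track free_track[NR_FREE_STACKS];
	uint8_t free_pointer_tag[NR_FREE_STACKS];
	uint8_t free_track_idx;
};

/* record a free: round-robin slot selection, remember the pointer tag */
static void set_free_info(struct alloc_meta *m, uint8_t tag, uint32_t stack_id)
{
	uint8_t idx = m->free_track_idx;

	m->free_pointer_tag[idx] = tag;
	m->free_track[idx].stack_id = stack_id;
	m->free_track_idx = (idx + 1) % NR_FREE_STACKS;
}

/* look up a free by tag; fall back to the slot that is overwritten next */
static struct track *get_free_track(struct alloc_meta *m, uint8_t tag)
{
	int i;

	for (i = 0; i < NR_FREE_STACKS; i++)
		if (m->free_pointer_tag[i] == tag)
			return &m->free_track[i];
	return &m->free_track[m->free_track_idx];
}

int main(void)
{
	struct alloc_meta m = { 0 };

	set_free_info(&m, 0xab, 101);
	set_free_info(&m, 0xcd, 102);
	printf("tag 0xcd -> stack %u\n",
	       (unsigned)get_free_track(&m, 0xcd)->stack_id);
	return 0;
}
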
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 700f5160f3e4..b52bd46ad146 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
@@ -1412,7 +1409,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = find_vma(mm, haddr);
- struct page *hpage = NULL;
+ struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd, _pmd;
spinlock_t *ptl;
@@ -1432,9 +1429,17 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
return;
+ hpage = find_lock_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, haddr));
+ if (!hpage)
+ return;
+
+ if (!PageHead(hpage))
+ goto drop_hpage;
+
pmd = mm_find_pmd(mm, haddr);
if (!pmd)
- return;
+ goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
@@ -1453,30 +1458,11 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
page = vm_normal_page(vma, addr, *pte);
- if (!page || !PageCompound(page))
- goto abort;
-
- if (!hpage) {
- hpage = compound_head(page);
- /*
- * The mapping of the THP should not change.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may
- * change the page table, but the new page will
- * not pass PageCompound() check.
- */
- if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
- goto abort;
- }
-
/*
- * Confirm the page maps to the correct subpage.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may change
- * the page table, but the new page will not pass
- * PageCompound() check.
+ * Note that uprobe, debugger, or MAP_PRIVATE may change the
+ * page table, but the new page will not be a subpage of hpage.
*/
- if (WARN_ON(hpage + i != page))
+ if (hpage + i != page)
goto abort;
count++;
}
@@ -1495,21 +1481,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
pte_unmap_unlock(start_pte, ptl);
/* step 3: set proper refcount and mm_counters. */
- if (hpage) {
+ if (count) {
page_ref_sub(hpage, count);
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
+
+drop_hpage:
+ unlock_page(hpage);
+ put_page(hpage);
return;
abort:
pte_unmap_unlock(start_pte, ptl);
+ goto drop_hpage;
}
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
@@ -1538,6 +1529,7 @@ out:
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1566,7 +1558,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1576,17 +1569,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (mmap_write_trylock(vma->vm_mm)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mmap_write_unlock(vma->vm_mm);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
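
The trylock comment above ("We need mmap_lock while holding page lock. Fault path does it in reverse order. Trylock is a way to avoid deadlock.") describes a classic ABBA-avoidance pattern. Below is a rough pthreads sketch of that pattern, not the kernel code itself: one path takes A then B, the other already holds B, so it only trylocks A and defers the work on failure, as retract_page_tables() defers to khugepaged_add_pte_mapped_thp().

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* think: mmap_lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* think: page lock */

static bool deferred;

/* fault-path order: A, then B */
static void fault_path(void)
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	/* ... handle the fault ... */
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
}

/* collapse-path order: already holds B, must not block on A */
static void collapse_path(void)
{
	pthread_mutex_lock(&lock_b);
	if (pthread_mutex_trylock(&lock_a) == 0) {
		/* ... collapse the page table ... */
		pthread_mutex_unlock(&lock_a);
	} else {
		deferred = true;	/* try again later */
	}
	pthread_mutex_unlock(&lock_b);
}

int main(void)
{
	fault_path();
	collapse_path();
	printf("deferred=%d\n", deferred);
	return 0;
}
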
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8cc617ede7e2..8d9ceea7fe4d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
-#define MEM_CGROUP_RECLAIM_RETRIES 5
-
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -257,8 +255,100 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
}
#ifdef CONFIG_MEMCG_KMEM
+extern spinlock_t css_set_lock;
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ struct mem_cgroup *memcg;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
+ unsigned long flags;
+
+ /*
+ * At this point all allocated objects are freed, and
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+ *
+ * The following sequence can lead to it:
+ * 1) CPU0: objcg == stock->cached_objcg
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+ * PAGE_SIZE bytes are charged
+ * 3) CPU1: a process from another memcg is allocating something,
+ * the stock is flushed,
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
+ * 4) CPU0: we release this object,
+ * 92 bytes are added to stock->nr_bytes
+ * 5) CPU0: stock is flushed,
+ * 92 bytes are added to objcg->nr_charged_bytes
+ *
+ * As a result, nr_charged_bytes == PAGE_SIZE.
+ * This page will be uncharged in obj_cgroup_release().
+ */
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ memcg = obj_cgroup_memcg(objcg);
+ if (nr_pages)
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ list_del(&objcg->list);
+ mem_cgroup_put(memcg);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+
+ percpu_ref_exit(ref);
+ kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+ struct obj_cgroup *objcg;
+ int ret;
+
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+ if (!objcg)
+ return NULL;
+
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(objcg);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&objcg->list);
+ return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ struct obj_cgroup *objcg, *iter;
+
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+ spin_lock_irq(&css_set_lock);
+
+ /* Move active objcg to the parent's list */
+ xchg(&objcg->memcg, parent);
+ css_get(&parent->css);
+ list_add(&objcg->list, &parent->objcg_list);
+
+ /* Move already reparented objcgs to the parent's list */
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
+ css_get(&parent->css);
+ xchg(&iter->memcg, parent);
+ css_put(&memcg->css);
+ }
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+ spin_unlock_irq(&css_set_lock);
+
+ percpu_ref_kill(&objcg->refcnt);
+}
+
/*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -301,14 +391,12 @@ void memcg_put_cache_ids(void)
/*
* A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and such to see this symbol as well
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
#endif
static int memcg_shrinker_map_size;
@@ -477,10 +565,17 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- if (PageSlab(page) && !PageTail(page))
- memcg = memcg_from_slab_page(page);
- else
- memcg = READ_ONCE(page->mem_cgroup);
+ memcg = page->mem_cgroup;
+
+ /*
+ * The lowest bit set means that memcg isn't a valid
+ * memcg pointer, but an obj_cgroups pointer.
+ * In this case the page is shared and doesn't belong
+ * to any specific memory cgroup.
+ */
+ if ((unsigned long) memcg & 0x1UL)
+ memcg = NULL;
+
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -681,13 +776,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- long x;
+ long x, threshold = MEMCG_CHARGE_BATCH;
if (mem_cgroup_disabled())
return;
+ if (vmstat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ if (unlikely(abs(x) > threshold)) {
struct mem_cgroup *mi;
/*
@@ -713,29 +811,12 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
return mem_cgroup_nodeinfo(parent, nid);
}
-/**
- * __mod_lruvec_state - update lruvec memory statistics
- * @lruvec: the lruvec
- * @idx: the stat item
- * @val: delta to add to the counter, can be negative
- *
- * The lruvec is the intersection of the NUMA node and a cgroup. This
- * function updates the all three counters that are affected by a
- * change of state at this level: per-node, per-cgroup, per-lruvec.
- */
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
- int val)
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
{
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x;
-
- /* Update node */
- __mod_node_page_state(pgdat, idx, val);
-
- if (mem_cgroup_disabled())
- return;
+ long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -746,8 +827,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update lruvec */
__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+ if (vmstat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ if (unlikely(abs(x) > threshold)) {
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pi;
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
@@ -757,6 +842,27 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates the all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ /* Update node */
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+ /* Update memcg and lruvec */
+ if (!mem_cgroup_disabled())
+ __mod_memcg_lruvec_state(lruvec, idx, val);
+}
+
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
@@ -1377,12 +1483,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
(u64)memcg_page_state(memcg, NR_FILE_PAGES) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
1024);
seq_buf_printf(&s, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
- PAGE_SIZE);
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
seq_buf_printf(&s, "sock %llu\n",
(u64)memcg_page_state(memcg, MEMCG_SOCK) *
PAGE_SIZE);
@@ -1412,11 +1517,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
PAGE_SIZE);
seq_buf_printf(&s, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
seq_buf_printf(&s, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
/* Accumulated memory events */
@@ -1560,15 +1663,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
.gfp_mask = gfp_mask,
.order = order,
};
- bool ret;
+ bool ret = true;
if (mutex_lock_killable(&oom_lock))
return true;
+
+ if (mem_cgroup_margin(memcg) >= (1 << order))
+ goto unlock;
+
/*
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
mutex_unlock(&oom_lock);
return ret;
}
@@ -2039,6 +2148,12 @@ EXPORT_SYMBOL(unlock_page_memcg);
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
+
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *cached_objcg;
+ unsigned int nr_bytes;
+#endif
+
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
@@ -2046,6 +2161,22 @@ struct memcg_stock_pcp {
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
+#ifdef CONFIG_MEMCG_KMEM
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg);
+
+#else
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ return false;
+}
+#endif
+
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
@@ -2086,13 +2217,17 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&old->memsw, stock->nr_pages);
- css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2108,6 +2243,7 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
+ drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -2128,6 +2264,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
@@ -2166,6 +2303,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
+ if (obj_stock_flush_required(stock, root_memcg))
+ flush = true;
rcu_read_unlock();
if (flush &&
@@ -2228,18 +2367,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
return 0;
}
-static void reclaim_high(struct mem_cgroup *memcg,
- unsigned int nr_pages,
- gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
{
+ unsigned long nr_reclaimed = 0;
+
do {
+ unsigned long pflags;
+
if (page_counter_read(&memcg->memory) <=
READ_ONCE(memcg->memory.high))
continue;
+
memcg_memory_event(memcg, MEMCG_HIGH);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+ psi_memstall_enter(&pflags);
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ gfp_mask, true);
+ psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+
+ return nr_reclaimed;
}
static void high_work_func(struct work_struct *work)
@@ -2395,16 +2545,32 @@ void mem_cgroup_handle_over_high(void)
{
unsigned long penalty_jiffies;
unsigned long pflags;
+ unsigned long nr_reclaimed;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *memcg;
+ bool in_retry = false;
if (likely(!nr_pages))
return;
memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
current->memcg_nr_pages_over_high = 0;
+retry_reclaim:
+ /*
+ * The allocating task should reclaim at least the batch size, but for
+ * subsequent retries we only want to do what's necessary to prevent oom
+ * or breaching resource isolation.
+ *
+ * This is distinct from memory.max or page allocator behaviour because
+ * memory.high is currently batched, whereas memory.max and the page
+ * allocator run every time an allocation is made.
+ */
+ nr_reclaimed = reclaim_high(memcg,
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+ GFP_KERNEL);
+
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
@@ -2432,6 +2598,16 @@ void mem_cgroup_handle_over_high(void)
goto out;
/*
+ * If reclaim is making forward progress but we're still over
+ * memory.high, we want to encourage that rather than doing allocator
+ * throttling.
+ */
+ if (nr_reclaimed || nr_retries--) {
+ in_retry = true;
+ goto retry_reclaim;
+ }
+
+ /*
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
* need to account for any ill-begotten jiffies to pay them off later.
@@ -2448,13 +2624,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
+ enum oom_status oom_status;
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- enum oom_status oom_status;
+ unsigned long pflags;
if (mem_cgroup_is_root(memcg))
return 0;
@@ -2514,8 +2691,10 @@ retry:
memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
+ psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
@@ -2567,7 +2746,7 @@ retry:
get_order(nr_pages * PAGE_SIZE));
switch (oom_status) {
case OOM_SUCCESS:
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
case OOM_FAILED:
goto force;
@@ -2586,12 +2765,10 @@ force:
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
return 0;
done_restock:
- css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
@@ -2649,8 +2826,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
}
#endif
@@ -2669,6 +2844,26 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
}
#ifdef CONFIG_MEMCG_KMEM
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp)
+{
+ unsigned int objects = objs_per_slab_page(s, page);
+ void *vec;
+
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+ page_to_nid(page));
+ if (!vec)
+ return -ENOMEM;
+
+ if (cmpxchg(&page->obj_cgroups, NULL,
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ kfree(vec);
+ else
+ kmemleak_not_leak(vec);
+
+ return 0;
+}
+
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
@@ -2685,17 +2880,50 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
- * Slab pages don't have page->mem_cgroup set because corresponding
- * kmem caches can be reparented during the lifetime. That's why
- * memcg_from_slab_page() should be used instead.
+ * Slab objects are accounted individually, not per-page.
+ * Memcg membership data for each individual object is saved in
+ * the page->obj_cgroups.
*/
- if (PageSlab(page))
- return memcg_from_slab_page(page);
+ if (page_has_obj_cgroups(page)) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
+
+ off = obj_to_index(page->slab_cache, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ if (objcg)
+ return obj_cgroup_memcg(objcg);
+
+ return NULL;
+ }
/* All other pages use page->mem_cgroup */
return page->mem_cgroup;
}
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+ struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (unlikely(!current->mm && !current->active_memcg))
+ return NULL;
+
+ rcu_read_lock();
+ if (unlikely(current->active_memcg))
+ memcg = rcu_dereference(current->active_memcg);
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ }
+ rcu_read_unlock();
+
+ return objcg;
+}
+
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2721,9 +2949,7 @@ static int memcg_alloc_cache_id(void)
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- err = memcg_update_all_caches(size);
- if (!err)
- err = memcg_update_all_list_lrus(size);
+ err = memcg_update_all_list_lrus(size);
if (!err)
memcg_nr_cache_ids = size;
@@ -2741,150 +2967,6 @@ static void memcg_free_cache_id(int id)
ida_simple_remove(&memcg_cache_ida, id);
}
-struct memcg_kmem_cache_create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
- struct memcg_kmem_cache_create_work *cw =
- container_of(w, struct memcg_kmem_cache_create_work, work);
- struct mem_cgroup *memcg = cw->memcg;
- struct kmem_cache *cachep = cw->cachep;
-
- memcg_create_kmem_cache(memcg, cachep);
-
- css_put(&memcg->css);
- kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct memcg_kmem_cache_create_work *cw;
-
- if (!css_tryget_online(&memcg->css))
- return;
-
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
- if (!cw) {
- css_put(&memcg->css);
- return;
- }
-
- cw->memcg = memcg;
- cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
- queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-static inline bool memcg_kmem_bypass(void)
-{
- if (in_interrupt())
- return true;
-
- /* Allow remote memcg charging in kthread contexts. */
- if ((!current->mm || (current->flags & PF_KTHREAD)) &&
- !current->active_memcg)
- return true;
- return false;
-}
-
-/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
-{
- struct mem_cgroup *memcg;
- struct kmem_cache *memcg_cachep;
- struct memcg_cache_array *arr;
- int kmemcg_id;
-
- VM_BUG_ON(!is_root_cache(cachep));
-
- if (memcg_kmem_bypass())
- return cachep;
-
- rcu_read_lock();
-
- if (unlikely(current->active_memcg))
- memcg = current->active_memcg;
- else
- memcg = mem_cgroup_from_task(current);
-
- if (!memcg || memcg == root_mem_cgroup)
- goto out_unlock;
-
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
- if (kmemcg_id < 0)
- goto out_unlock;
-
- arr = rcu_dereference(cachep->memcg_params.memcg_caches);
-
- /*
- * Make sure we will access the up-to-date value. The code updating
- * memcg_caches issues a write barrier to match the data dependency
- * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
- */
- memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
-
- /*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * memcg_create_kmem_cache, this means no further allocation
- * could happen with the slab_mutex held. So it's better to
- * defer everything.
- *
- * If the memcg is dying or memcg_cache is about to be released,
- * don't bother creating new kmem_caches. Because memcg_cachep
- * is ZEROed as the fist step of kmem offlining, we don't need
- * percpu_ref_tryget_live() here. css_tryget_online() check in
- * memcg_schedule_kmem_cache_create() will prevent us from
- * creation of a new kmem_cache.
- */
- if (unlikely(!memcg_cachep))
- memcg_schedule_kmem_cache_create(memcg, cachep);
- else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
- cachep = memcg_cachep;
-out_unlock:
- rcu_read_unlock();
- return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
- if (!is_root_cache(cachep))
- percpu_ref_put(&cachep->memcg_params.refcnt);
-}
-
/**
* __memcg_kmem_charge: charge a number of kernel pages to a memcg
* @memcg: memory cgroup to charge
@@ -2958,6 +3040,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ return 0;
}
}
css_put(&memcg->css);
@@ -2980,13 +3063,146 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
page->mem_cgroup = NULL;
+ css_put(&memcg->css);
/* slab pages do not have PageKmemcg flag set */
if (PageKmemcg(page))
__ClearPageKmemcg(page);
+}
+
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ bool ret = false;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ stock->nr_bytes -= nr_bytes;
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+ struct obj_cgroup *old = stock->cached_objcg;
+
+ if (!old)
+ return;
+
+ if (stock->nr_bytes) {
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+ if (nr_pages) {
+ rcu_read_lock();
+ __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
+ rcu_read_unlock();
+ }
+
+ /*
+ * The leftover is flushed to the centralized per-objcg value.
+ * On the next attempt to refill the obj stock it will be moved
+ * to a per-cpu stock (probably on another CPU), see
+ * refill_obj_stock().
+ *
+ * How often it's flushed is a trade-off between the memory
+ * limit enforcement accuracy and potential CPU contention,
+ * so it might be changed in the future.
+ */
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
+ stock->nr_bytes = 0;
+ }
+
+ obj_cgroup_put(old);
+ stock->cached_objcg = NULL;
+}
+
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+
+ return false;
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->cached_objcg = objcg;
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ }
+ stock->nr_bytes += nr_bytes;
+
+ if (stock->nr_bytes > PAGE_SIZE)
+ drain_obj_stock(stock);
+
+ local_irq_restore(flags);
+}
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ struct mem_cgroup *memcg;
+ unsigned int nr_pages, nr_bytes;
+ int ret;
+
+ if (consume_obj_stock(objcg, size))
+ return 0;
+
+ /*
+ * In theory, objcg->nr_charged_bytes can have enough
+ * pre-charged bytes to satisfy the allocation. However,
+ * flushing objcg->nr_charged_bytes requires two atomic
+ * operations, and objcg->nr_charged_bytes can't be big,
+ * so it's better to ignore it and try to grab some new pages.
+ * objcg->nr_charged_bytes will be flushed in
+ * refill_obj_stock(), called from this function or
+ * independently later.
+ */
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ css_get(&memcg->css);
+ rcu_read_unlock();
+
+ nr_pages = size >> PAGE_SHIFT;
+ nr_bytes = size & (PAGE_SIZE - 1);
+
+ if (nr_bytes)
+ nr_pages += 1;
- css_put_many(&memcg->css, nr_pages);
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ if (!ret && nr_bytes)
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+
+ css_put(&memcg->css);
+ return ret;
}
+
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+ refill_obj_stock(objcg, size);
+}
+
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2997,13 +3213,16 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
+ struct mem_cgroup *memcg = head->mem_cgroup;
int i;
if (mem_cgroup_disabled())
return;
- for (i = 1; i < HPAGE_PMD_NR; i++)
- head[i].mem_cgroup = head->mem_cgroup;
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ css_get(&memcg->css);
+ head[i].mem_cgroup = memcg;
+ }
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3207,7 +3426,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
*/
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -3404,6 +3623,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
+ struct obj_cgroup *objcg;
int memcg_id;
if (cgroup_memory_nokmem)
@@ -3416,7 +3636,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
if (memcg_id < 0)
return memcg_id;
- static_branch_inc(&memcg_kmem_enabled_key);
+ objcg = obj_cgroup_alloc();
+ if (!objcg) {
+ memcg_free_cache_id(memcg_id);
+ return -ENOMEM;
+ }
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ static_branch_enable(&memcg_kmem_enabled_key);
+
/*
* A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
@@ -3425,7 +3654,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
- INIT_LIST_HEAD(&memcg->kmem_caches);
return 0;
}
@@ -3438,22 +3666,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (memcg->kmem_state != KMEM_ONLINE)
return;
- /*
- * Clear the online state before clearing memcg_caches array
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
- * guarantees that no cache will be created for this cgroup
- * after we are done (see memcg_create_kmem_cache()).
- */
+
memcg->kmem_state = KMEM_ALLOCATED;
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
- /*
- * Deactivate and reparent kmem_caches.
- */
- memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_reparent_objcgs(memcg, parent);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -3486,11 +3706,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
/* css_alloc() failed, offlining didn't happen */
if (unlikely(memcg->kmem_state == KMEM_ONLINE))
memcg_offline_kmem(memcg);
-
- if (memcg->kmem_state == KMEM_ALLOCATED) {
- WARN_ON(!list_empty(&memcg->kmem_caches));
- static_branch_dec(&memcg_kmem_enabled_key);
- }
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -4800,9 +5015,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
- .seq_start = memcg_slab_start,
- .seq_next = memcg_slab_next,
- .seq_stop = memcg_slab_stop,
.seq_show = memcg_slab_show,
},
#endif
@@ -5022,6 +5234,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
+ INIT_LIST_HEAD(&memcg->objcg_list);
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
@@ -5084,9 +5297,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* The following stuff does not apply to the root */
if (!parent) {
-#ifdef CONFIG_MEMCG_KMEM
- INIT_LIST_HEAD(&memcg->kmem_caches);
-#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -5448,7 +5658,10 @@ static int mem_cgroup_move_account(struct page *page,
*/
smp_mb();
- page->mem_cgroup = to; /* caller should have done css_get */
+ css_get(&to->css);
+ css_put(&from->css);
+
+ page->mem_cgroup = to;
__unlock_page_memcg(from);
@@ -5669,8 +5882,6 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- css_put_many(&mc.to->css, mc.moved_swap);
-
mc.moved_swap = 0;
}
memcg_oom_recover(from);
@@ -6036,7 +6247,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long high;
int err;
@@ -6046,8 +6257,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- page_counter_set_high(&memcg->memory, high);
-
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -6071,6 +6280,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
break;
}
+ page_counter_set_high(&memcg->memory, high);
+
+ memcg_wb_domain_size_changed(memcg);
+
return nbytes;
}
@@ -6084,7 +6297,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long max;
int err;
@@ -6391,40 +6604,42 @@ static unsigned long effective_protection(unsigned long usage,
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- * MEMCG_PROT_NONE: cgroup memory is not protected
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
- * an unprotected supply of reclaimable memory from other cgroups.
- * MEMCG_PROT_MIN: cgroup memory is protected
*/
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
unsigned long usage, parent_usage;
struct mem_cgroup *parent;
if (mem_cgroup_disabled())
- return MEMCG_PROT_NONE;
+ return;
if (!root)
root = root_mem_cgroup;
+
+ /*
+ * Effective values of the reclaim targets are ignored so they
+ * can be stale. Have a look at mem_cgroup_protection for more
+ * details.
+ * TODO: calculation should be more robust so that we do not need
+ * that special casing.
+ */
if (memcg == root)
- return MEMCG_PROT_NONE;
+ return;
usage = page_counter_read(&memcg->memory);
if (!usage)
- return MEMCG_PROT_NONE;
+ return;
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
- return MEMCG_PROT_NONE;
+ return;
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
memcg->memory.elow = READ_ONCE(memcg->memory.low);
- goto out;
+ return;
}
parent_usage = page_counter_read(&parent->memory);
@@ -6438,14 +6653,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
READ_ONCE(memcg->memory.low),
READ_ONCE(parent->memory.elow),
atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
- if (usage <= memcg->memory.emin)
- return MEMCG_PROT_MIN;
- else if (usage <= memcg->memory.elow)
- return MEMCG_PROT_LOW;
- else
- return MEMCG_PROT_NONE;
}
/**
@@ -6498,6 +6705,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
if (ret)
goto out_put;
+ css_get(&memcg->css);
commit_charge(page, memcg);
local_irq_disable();
@@ -6552,9 +6760,6 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
-
- if (!mem_cgroup_is_root(ug->memcg))
- css_put_many(&ug->memcg->css, ug->nr_pages);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
@@ -6592,6 +6797,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
ug->dummy_page = page;
page->mem_cgroup = NULL;
+ css_put(&ug->memcg->css);
}
static void uncharge_list(struct list_head *page_list)
@@ -6697,8 +6903,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
+ css_get(&memcg->css);
commit_charge(newpage, memcg);
local_irq_save(flags);
@@ -6821,17 +7027,6 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
-#ifdef CONFIG_MEMCG_KMEM
- /*
- * Kmem cache creation is mostly done with the slab_mutex held,
- * so use a workqueue with limited concurrency to avoid stalling
- * all worker threads in case lots of cgroups are created and
- * destroyed simultaneously.
- */
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
- BUG_ON(!memcg_kmem_cache_wq);
-#endif
-
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
@@ -6935,8 +7130,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
mem_cgroup_charge_statistics(memcg, page, -nr_entries);
memcg_check_events(memcg, page);
- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_entries);
+ css_put(&memcg->css);
}
/**
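
To make the byte-level accounting added in this file easier to follow, here is a single-threaded userspace sketch of the charge path: charges are rounded up to whole pages, the unused remainder is parked in a per-cpu stock, and leftovers are flushed to an atomic per-objcg byte counter when the stock is drained. All names, the missing objcg-match check on the fast path, and the absence of irq/cpu handling are simplifications of the functions above.

#include <stdio.h>
#include <stdatomic.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12

static unsigned long charged_pages;	/* stands in for the memcg page_counter */
static atomic_ulong nr_charged_bytes;	/* stands in for objcg->nr_charged_bytes */
static unsigned long stock_bytes;	/* stands in for stock->nr_bytes */

static void drain_stock(void)
{
	unsigned long nr_pages = stock_bytes >> PAGE_SHIFT;
	unsigned long rem = stock_bytes & (PAGE_SIZE - 1);

	if (nr_pages)
		charged_pages -= nr_pages;	  /* uncharge whole pages */
	atomic_fetch_add(&nr_charged_bytes, rem); /* flush the leftover bytes */
	stock_bytes = 0;
}

static void refill_stock(unsigned long nr_bytes)
{
	stock_bytes += nr_bytes;
	if (stock_bytes > PAGE_SIZE)
		drain_stock();
}

static int charge(size_t size)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long nr_bytes = size & (PAGE_SIZE - 1);

	if (stock_bytes >= size) {	/* consume_obj_stock() fast path */
		stock_bytes -= size;
		return 0;
	}
	if (nr_bytes)
		nr_pages++;		/* round the charge up to whole pages */
	charged_pages += nr_pages;
	if (nr_bytes)
		refill_stock(PAGE_SIZE - nr_bytes); /* keep the unused part */
	return 0;
}

int main(void)
{
	charge(100);	/* charges 1 page, stocks 3996 bytes */
	charge(200);	/* served entirely from the stock */
	printf("pages=%lu stock=%lu\n", charged_pages, stock_bytes);
	return 0;
}
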
diff --git a/mm/memory.c b/mm/memory.c
index 0da48f6586f8..c39a13b09602 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1098,7 +1098,7 @@ again:
}
entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ if (is_device_private_entry(entry)) {
struct page *page = device_private_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
@@ -2082,7 +2082,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
- * @addr: target user address to start at
+ * @addr: target page aligned user address to start at
* @pfn: page frame number of kernel physical memory address
* @size: size of mapping area
* @prot: page protection flags for this mapping
@@ -2101,6 +2101,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long remap_pfn = pfn;
int err;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
+ return -EINVAL;
+
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
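
A minimal sketch of the alignment test behind the new WARN_ON_ONCE(!PAGE_ALIGNED(addr)) guard in remap_pfn_range(); PAGE_SIZE is hard-coded to 4 KiB here purely for illustration: an address is page aligned when its low PAGE_SHIFT bits are zero.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGNED(a)	(((a) & (PAGE_SIZE - 1)) == 0)

int main(void)
{
	printf("%d %d\n", PAGE_ALIGNED(0x1000), PAGE_ALIGNED(0x1234)); /* 1 0 */
	return 0;
}
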
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index da374cd3d45b..ac6961abaa10 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -831,6 +831,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
zone->zone_pgdat->node_present_pages += onlined_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ /*
+ * When exposing larger, physically contiguous memory areas to the
+ * buddy, shuffling in the buddy (when freeing onlined pages, putting
+ * them either to the head or the tail of the freelist) is only helpful
+ * for maintaining the shuffle, but not for creating the initial
+ * shuffle. Shuffle the whole zone to make sure the just onlined pages
+ * are properly distributed across the whole freelist.
+ */
shuffle_zone(zone);
node_states_set_node(nid, &arg);
@@ -844,8 +852,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
- vm_total_pages = nr_free_pagecache_pages();
-
writeback_set_ratelimit();
memory_notify(MEM_ONLINE, &arg);
@@ -1595,7 +1601,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
kcompactd_stop(node);
}
- vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
diff --git a/mm/migrate.c b/mm/migrate.c
index 4fcc465736ff..d179657f8685 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2386,9 +2386,9 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* that the registered device driver can skip invalidating device
* private page mappings that won't be migrated.
*/
- mmu_notifier_range_init(&range, MMU_NOTIFY_MIGRATE, 0, migrate->vma,
- migrate->vma->vm_mm, migrate->start, migrate->end);
- range.migrate_pgmap_owner = migrate->pgmap_owner;
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 435e5f794b3b..b06a30fbedff 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -13,6 +13,7 @@
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/sched.h>
+#include <linux/mman.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -144,14 +145,23 @@ EXPORT_SYMBOL_GPL(mm_kobj);
#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;
-static void __meminit mm_compute_batch(void)
+void mm_compute_batch(int overcommit_policy)
{
u64 memsized_batch;
s32 nr = num_present_cpus();
s32 batch = max_t(s32, nr*2, 32);
-
- /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
- memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
+ unsigned long ram_pages = totalram_pages();
+
+ /*
+ * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
+ * (total memory/#cpus), and lift it to 25% for other policies
+ * to ease the possible lock contention for percpu_counter
+ * vm_committed_as, while the max limit is INT_MAX
+ */
+ if (overcommit_policy == OVERCOMMIT_NEVER)
+ memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
+ else
+ memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
@@ -162,7 +172,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
- mm_compute_batch();
+ mm_compute_batch(sysctl_overcommit_memory);
default:
break;
}
@@ -176,7 +186,7 @@ static struct notifier_block compute_batch_nb __meminitdata = {
static int __init mm_compute_batch_init(void)
{
- mm_compute_batch();
+ mm_compute_batch(sysctl_overcommit_memory);
register_hotmemory_notifier(&compute_batch_nb);
return 0;
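
The batch sizing described in the comment above works out as in this small arithmetic sketch: 1/256 (about 0.4%) of per-CPU memory for OVERCOMMIT_NEVER, 1/4 (25%) otherwise, never below max(2*ncpus, 32) and never above INT_MAX. The 16 GiB / 8 CPU figures are arbitrary, and OVERCOMMIT_NEVER is assumed to be 2 as in include/uapi/linux/mman.h.

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define OVERCOMMIT_NEVER 2	/* assumed value from include/uapi/linux/mman.h */

static long compute_batch(uint64_t ram_pages, int ncpus, int policy)
{
	long batch = ncpus * 2 > 32 ? ncpus * 2 : 32;
	uint64_t memsized;

	if (policy == OVERCOMMIT_NEVER)
		memsized = ram_pages / ncpus / 256;	/* ~0.4% per CPU */
	else
		memsized = ram_pages / ncpus / 4;	/* 25% per CPU */
	if (memsized > INT_MAX)
		memsized = INT_MAX;

	return memsized > (uint64_t)batch ? (long)memsized : batch;
}

int main(void)
{
	/* 16 GiB of 4 KiB pages on 8 CPUs */
	uint64_t pages = (16ULL << 30) >> 12;

	printf("never:  %ld pages\n", compute_batch(pages, 8, OVERCOMMIT_NEVER));
	printf("always: %ld pages\n", compute_batch(pages, 8, 0));
	return 0;
}
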
diff --git a/mm/mmap.c b/mm/mmap.c
index dcdab2675a21..40248d84ad5f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1030,7 +1030,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
*
* We don't check here for the merged mmap wrapping around the end of pagecache
- * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
*/
static int
@@ -1365,11 +1365,11 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, vm_flags_t vm_flags,
- unsigned long pgoff, unsigned long *populate,
- struct list_head *uf)
+ unsigned long flags, unsigned long pgoff,
+ unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
+ vm_flags_t vm_flags;
int pkey = 0;
*populate = 0;
@@ -1431,7 +1431,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -1562,11 +1562,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
file = fget(fd);
if (!file)
return -EBADF;
- if (is_file_hugepages(file))
+ if (is_file_hugepages(file)) {
len = ALIGN(len, huge_page_size(hstate_file(file)));
- retval = -EINVAL;
- if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+ } else if (unlikely(flags & MAP_HUGETLB)) {
+ retval = -EINVAL;
goto out_fput;
+ }
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
struct hstate *hs;
@@ -1689,7 +1690,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
+ struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
@@ -1773,6 +1774,25 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (error)
goto unmap_and_free_vma;
+ /* If vm_flags changed after call_mmap(), we should try to merge the vma again
+ * as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
+ if (merge) {
+ fput(file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags and possibly addr to pick up the change. We don't
+ * warn here if addr changed as the vma is not linked by vma_link().
+ */
+ addr = vma->vm_start;
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
@@ -1795,6 +1815,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
+unmap_writable:
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
@@ -2209,7 +2230,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
- * do_mmap_pgoff() will clear pgoff, so match alignment.
+ * do_mmap() will clear pgoff, so match alignment.
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
@@ -2982,7 +3003,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
file = get_file(vma->vm_file);
- ret = do_mmap_pgoff(vma->vm_file, start, size,
+ ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
fput(file);
out:
@@ -3202,7 +3223,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap_pgoff and in do_brk.
+ * Similarly in do_mmap and in do_brk.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
diff --git a/mm/mremap.c b/mm/mremap.c
index 6b153dc05fe4..138abbae4f75 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -193,17 +193,12 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
struct mm_struct *mm = vma->vm_mm;
pmd_t pmd;
- if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
- || old_end - old_addr < PMD_SIZE)
- return false;
-
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
@@ -279,6 +274,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
extent = next - old_addr;
if (extent > old_end - old_addr)
extent = old_end - old_addr;
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
@@ -292,7 +290,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_huge_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd);
+ old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -312,7 +310,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_normal_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd);
+ old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -322,9 +320,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
- next = (new_addr + PMD_SIZE) & PMD_MASK;
- if (extent > next - new_addr)
- extent = next - new_addr;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks);
}
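
A standalone sketch of the extent clamping move_page_tables() now does before looking up the pmds: each step of the copy loop is limited by the distance to the next PMD boundary on both the source and the destination side, so a single pmd move never crosses a 2 MiB boundary on either side.

#include <stdio.h>

#define PMD_SIZE (1UL << 21)
#define PMD_MASK (~(PMD_SIZE - 1))

static unsigned long step_extent(unsigned long old_addr, unsigned long old_end,
				 unsigned long new_addr)
{
	unsigned long next, extent;

	next = (old_addr + PMD_SIZE) & PMD_MASK;	/* next source boundary */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;

	next = (new_addr + PMD_SIZE) & PMD_MASK;	/* next destination boundary */
	if (extent > next - new_addr)
		extent = next - new_addr;

	return extent;
}

int main(void)
{
	/* source 1 MiB into a PMD, destination 1.5 MiB into a PMD */
	unsigned long old = 0x100000, end = old + (4UL << 20), new = 0x180000;

	printf("extent = 0x%lx\n", step_extent(old, end, new));	/* 0x80000 */
	return 0;
}
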
diff --git a/mm/nommu.c b/mm/nommu.c
index 314174817b04..340ae7774c13 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1078,7 +1078,6 @@ unsigned long do_mmap(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
- vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
@@ -1086,6 +1085,7 @@ unsigned long do_mmap(struct file *file,
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *rb;
+ vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
@@ -1104,7 +1104,7 @@ unsigned long do_mmap(struct file *file,
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
- vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
+ vm_flags = determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
@@ -1763,7 +1763,7 @@ EXPORT_SYMBOL_GPL(access_process_vm);
*
* Check the shared mappings on an inode on behalf of a shrinking truncate to
 * make sure that any outstanding VMAs aren't broken and then shrink the
- * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
+ * vm_regions that extend beyond it so that do_mmap() doesn't
* automatically grant mappings that are too large.
*/
int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6e94962893ee..d30ce75f23fb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -184,7 +184,7 @@ static bool is_dump_unreclaim_slabs(void)
global_node_page_state(NR_ISOLATED_FILE) +
global_node_page_state(NR_UNEVICTABLE);
- return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+ return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
}
/**
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 28b3e7a67565..4e4ddd67b71e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2076,13 +2076,11 @@ static int page_writeback_cpu_online(unsigned int cpu)
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
- * related to pages that could be allocated for buffers (by
- * comparing nr_free_buffer_pages() to vm_total_pages.
+ * related to pages that could be allocated for buffers.
*
* However, that was when we used "dirty_ratio" to scale with
* all memory, and we don't do that any more. "dirty_ratio"
- * is now applied to total non-HIGHPAGE memory (by subtracting
- * totalhigh_pages from vm_total_pages), and as such we can't
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
* get into the old insane situation any more where we had
* large amounts of dirty pages compared to a small amount of
* non-HIGHMEM memory.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 901a21f61d68..167732f4d124 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -459,25 +459,23 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+static __always_inline
+unsigned long __get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
unsigned long *bitmap;
@@ -490,20 +488,18 @@ static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
- bitidx += end_bitidx;
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+ return (word >> bitidx) & mask;
}
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+ return __get_pfnblock_flags_mask(page, pfn, mask);
}
static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
@@ -511,12 +507,10 @@ static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned
* @page: The page within the block of interest
* @flags: The flags to set
* @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
* @mask: mask of bits that the caller is interested in
*/
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
unsigned long *bitmap;
@@ -533,9 +527,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
- bitidx += end_bitidx;
- mask <<= (BITS_PER_LONG - bitidx - 1);
- flags <<= (BITS_PER_LONG - bitidx - 1);
+ mask <<= bitidx;
+ flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
@@ -552,8 +545,8 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
- set_pageblock_flags_group(page, (unsigned long)migratetype,
- PB_migrate, PB_migrate_end);
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
+ page_to_pfn(page), MIGRATETYPE_MASK);
}
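
The hunks above drop the end_bitidx mirror arithmetic and address the pageblock bitfield directly from the low end of the word. A small user-space sketch of that simplified get/set, assuming a 4-bit stride and a 3-bit migratetype mask; unlike the real set_pfnblock_flags_mask(), this version is not atomic (the kernel code keeps its cmpxchg loop):

#include <assert.h>

#define NR_PAGEBLOCK_BITS	4		/* assumed field stride */
#define MIGRATETYPE_MASK	((1UL << 3) - 1)

static unsigned long get_field(unsigned long word, unsigned int bitidx,
			       unsigned long mask)
{
	return (word >> bitidx) & mask;	/* new-style extraction */
}

static unsigned long set_field(unsigned long word, unsigned int bitidx,
			       unsigned long mask, unsigned long flags)
{
	word &= ~(mask << bitidx);		/* clear the field... */
	word |= (flags & mask) << bitidx;	/* ...and store the new value */
	return word;
}

int main(void)
{
	unsigned long word = 0;
	unsigned int bitidx = 2 * NR_PAGEBLOCK_BITS;	/* third block in the word */

	word = set_field(word, bitidx, MIGRATETYPE_MASK, 3);
	assert(get_field(word, bitidx, MIGRATETYPE_MASK) == 3);
	return 0;
}
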
#ifdef CONFIG_DEBUG_VM
@@ -813,11 +806,10 @@ static inline struct capture_control *task_capc(struct zone *zone)
{
struct capture_control *capc = current->capture_control;
- return capc &&
+ return unlikely(capc) &&
!(current->flags & PF_KTHREAD) &&
!capc->page &&
- capc->cc->zone == zone &&
- capc->cc->direct_compaction ? capc : NULL;
+ capc->cc->zone == zone ? capc : NULL;
}
static inline bool
@@ -1164,8 +1156,11 @@ static void kernel_init_free_pages(struct page *page, int numpages)
{
int i;
+ /* s390's use of memset() could overwrite KASAN redzones. */
+ kasan_disable_current();
for (i = 0; i < numpages; i++)
clear_highpage(page + i);
+ kasan_enable_current();
}
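
The kernel_init_free_pages() hunk brackets the page clearing with kasan_disable_current()/kasan_enable_current(). One plausible model of why such a bracket nests safely is a per-task depth counter that suppresses reports while non-zero; the sketch below illustrates only that pattern and is not the actual KASAN implementation:

#include <assert.h>

static __thread int kasan_depth;	/* per-task nesting counter (model) */

static void kasan_disable_current_sketch(void) { kasan_depth++; }
static void kasan_enable_current_sketch(void)  { kasan_depth--; }
static int  report_enabled(void)               { return kasan_depth == 0; }

int main(void)
{
	kasan_disable_current_sketch();
	/* ...clear_highpage()-style writes that may sweep across redzones... */
	assert(!report_enabled());

	kasan_enable_current_sketch();
	assert(report_enabled());
	return 0;
}
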
static __always_inline bool free_pages_prepare(struct page *page,
@@ -2273,7 +2268,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
-static int fallbacks[MIGRATE_TYPES][4] = {
+static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
@@ -2790,7 +2785,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
* allocating from CMA when over half of the zone's free memory
* is in the CMA area.
*/
- if (migratetype == MIGRATE_MOVABLE &&
+ if (alloc_flags & ALLOC_CMA &&
zone_page_state(zone, NR_FREE_CMA_PAGES) >
zone_page_state(zone, NR_FREE_PAGES) / 2) {
page = __rmqueue_cma_fallback(zone, order);
@@ -2801,7 +2796,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
- if (migratetype == MIGRATE_MOVABLE)
+ if (alloc_flags & ALLOC_CMA)
page = __rmqueue_cma_fallback(zone, order);
if (!page && __rmqueue_fallback(zone, order, migratetype,
@@ -3487,6 +3482,29 @@ static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+static inline long __zone_watermark_unusable_free(struct zone *z,
+ unsigned int order, unsigned int alloc_flags)
+{
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+ long unusable_free = (1 << order) - 1;
+
+ /*
+ * If the caller does not have rights to ALLOC_HARDER then subtract
+ * the high-atomic reserves. This will over-estimate the size of the
+ * atomic reserve but it avoids a search.
+ */
+ if (likely(!alloc_harder))
+ unusable_free += z->nr_reserved_highatomic;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+ return unusable_free;
+}
+
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true if the order-0 watermark is reached and there is at least
@@ -3502,19 +3520,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
- free_pages -= (1 << order) - 1;
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- /*
- * If the caller does not have rights to ALLOC_HARDER then subtract
- * the high-atomic reserves. This will over-estimate the size of the
- * atomic reserve but it avoids a search.
- */
- if (likely(!alloc_harder)) {
- free_pages -= z->nr_reserved_highatomic;
- } else {
+ if (unlikely(alloc_harder)) {
/*
* OOM victims can try even harder than normal ALLOC_HARDER
* users on the grounds that it's definitely going to be in
@@ -3527,13 +3538,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
min -= min / 4;
}
-
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
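
The new __zone_watermark_unusable_free() helper concentrates the "pages this caller may not touch" arithmetic in one place. A user-space sketch of how the order-0 check consumes it, with made-up flag values and a reduced zone structure:

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HARDER	0x1		/* stand-in flag values */
#define ALLOC_OOM	0x2
#define ALLOC_CMA	0x4

struct zone_sketch {
	long nr_free;			/* NR_FREE_PAGES */
	long nr_free_cma;		/* NR_FREE_CMA_PAGES */
	long nr_reserved_highatomic;
	long lowmem_reserve;		/* for the requested zone index */
};

static long unusable_free(const struct zone_sketch *z, unsigned int order,
			  unsigned int alloc_flags)
{
	long unusable = (1L << order) - 1;

	if (!(alloc_flags & (ALLOC_HARDER | ALLOC_OOM)))
		unusable += z->nr_reserved_highatomic;
	if (!(alloc_flags & ALLOC_CMA))
		unusable += z->nr_free_cma;
	return unusable;
}

static bool order0_watermark_ok(const struct zone_sketch *z, long mark,
				unsigned int alloc_flags)
{
	long free = z->nr_free - unusable_free(z, 0, alloc_flags);

	return free > mark + z->lowmem_reserve;
}

int main(void)
{
	struct zone_sketch z = {
		.nr_free = 1000, .nr_free_cma = 600,
		.nr_reserved_highatomic = 64, .lowmem_reserve = 100,
	};

	/* Without ALLOC_CMA the 600 CMA pages cannot be counted: 336 <= 400. */
	printf("%d\n", order0_watermark_ok(&z, 300, 0));		/* 0 */
	/* With ALLOC_CMA only the high-atomic reserve is unusable: 936 > 400. */
	printf("%d\n", order0_watermark_ok(&z, 300, ALLOC_CMA));	/* 1 */
	return 0;
}
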
@@ -3580,30 +3584,42 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, gfp_t gfp_mask)
{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
- long cma_pages = 0;
+ long free_pages;
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
/*
* Fast check for order-0 only. If this fails then the reserves
- * need to be calculated. There is a corner case where the check
- * passes but only the high-order atomic reserve are free. If
- * the caller is !atomic then it'll uselessly search the free
- * list. That corner case is then slower but it is harmless.
+ * need to be calculated.
*/
- if (!order && (free_pages - cma_pages) >
- mark + z->lowmem_reserve[highest_zoneidx])
+ if (!order) {
+ long fast_free;
+
+ fast_free = free_pages;
+ fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
+ if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ return true;
+ }
+
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+ free_pages))
return true;
+ /*
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * when checking the min watermark. The min watermark is the
+ * point where boosting is ignored so that kswapd is woken up
+ * when below the low watermark.
+ */
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
+ mark = z->_watermark[WMARK_MIN];
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
+ alloc_flags, free_pages);
+ }
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
- free_pages);
+ return false;
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
@@ -3671,6 +3687,20 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
return alloc_flags;
}
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
+ unsigned int alloc_flags)
+{
+#ifdef CONFIG_CMA
+ unsigned int pflags = current->flags;
+
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+
+#endif
+ return alloc_flags;
+}
+
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
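
current_alloc_flags() makes ALLOC_CMA depend on both the request's migratetype and the task's PF_MEMALLOC_NOCMA scope flag. A user-space model of that gate; the flag values and the task structure below are stand-ins:

#include <stdio.h>

#define PF_MEMALLOC_NOCMA	0x1	/* stand-in task flag */
#define ALLOC_CMA		0x2	/* stand-in alloc flag */
#define MIGRATE_MOVABLE		1

struct task_sketch { unsigned int flags; };
static struct task_sketch current_task;

static unsigned int current_alloc_flags_sketch(int migratetype,
					       unsigned int alloc_flags)
{
	if (!(current_task.flags & PF_MEMALLOC_NOCMA) &&
	    migratetype == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;
	return alloc_flags;
}

int main(void)
{
	/* Movable allocation, no scope restriction: CMA may be used. */
	printf("%#x\n", current_alloc_flags_sketch(MIGRATE_MOVABLE, 0)); /* 0x2 */

	/* Inside a memalloc_nocma_save()-style scope: CMA stays off. */
	current_task.flags |= PF_MEMALLOC_NOCMA;
	printf("%#x\n", current_alloc_flags_sketch(MIGRATE_MOVABLE, 0)); /* 0 */
	return 0;
}
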
@@ -3747,7 +3777,8 @@ retry:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
- ac->highest_zoneidx, alloc_flags)) {
+ ac->highest_zoneidx, alloc_flags,
+ gfp_mask)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -4316,10 +4347,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
-#ifdef CONFIG_CMA
- if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+
return alloc_flags;
}
@@ -4620,7 +4649,7 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = reserve_flags;
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -4697,7 +4726,7 @@ retry:
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
- (alloc_flags == ALLOC_OOM ||
+ (alloc_flags & ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -4771,7 +4800,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
- if (!ac->nodemask)
+ /*
+ * When we are in interrupt context, the cpuset of the
+ * current task is irrelevant, so any node is ok.
+ */
+ if (!in_interrupt() && !ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
@@ -4785,8 +4818,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
- *alloc_flags |= ALLOC_CMA;
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
return true;
}
@@ -5165,19 +5197,6 @@ unsigned long nr_free_buffer_pages(void)
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-/**
- * nr_free_pagecache_pages - count number of pages beyond high watermark
- *
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
- * high watermark within all zones.
- *
- * Return: number of pages beyond high watermark within all zones.
- */
-unsigned long nr_free_pagecache_pages(void)
-{
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
-}
-
static inline void show_node(struct zone *zone)
{
if (IS_ENABLED(CONFIG_NUMA))
@@ -5220,8 +5239,8 @@ long si_mem_available(void)
* items that are in use, and cannot be freed. Cap this estimate at the
* low watermark.
*/
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
@@ -5364,8 +5383,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
- global_node_page_state(NR_SLAB_RECLAIMABLE),
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_zone_page_state(NR_PAGETABLE),
@@ -5396,6 +5415,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -5417,6 +5440,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -5448,10 +5475,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
- " shadow_call_stack:%lukB"
-#endif
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
@@ -5473,10 +5496,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- zone_page_state(zone, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
- zone_page_state(zone, NR_KERNEL_SCS_KB),
-#endif
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
@@ -5891,13 +5910,16 @@ build_all_zonelists_init(void)
*/
void __ref build_all_zonelists(pg_data_t *pgdat)
{
+ unsigned long vm_total_pages;
+
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
__build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
- vm_total_pages = nr_free_pagecache_pages();
+ /* Get the number of free pages beyond high watermark in all zones. */
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
@@ -6325,22 +6347,6 @@ void __meminit init_currently_empty_zone(struct zone *zone,
}
/**
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
- *
- * If an architecture guarantees that all ranges registered contain no holes and may
- * be freed, this function may be used instead of calling memory_present() manually.
- */
-void __init sparse_memory_present_with_active_regions(int nid)
-{
- unsigned long start_pfn, end_pfn;
- int i, this_nid;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
- memory_present(this_nid, start_pfn, end_pfn);
-}
-
-/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
diff --git a/mm/page_counter.c b/mm/page_counter.c
index c56db2d5e159..b4663844c9b3 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -72,7 +72,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -116,7 +116,7 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -125,7 +125,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
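
The page_counter fixes above pass the loop-local counter 'c' instead of the leaf 'counter' while walking up the parent chain. A reduced model of the rule the fix enforces, with a simple watermark standing in for propagate_protected_usage(); no atomics or protection math here:

#include <stdio.h>

struct counter_sketch {
	struct counter_sketch *parent;
	long usage;
	long watermark;		/* per-level bookkeeping updated on charge */
};

static void note_usage(struct counter_sketch *c, long new)
{
	if (new > c->watermark)
		c->watermark = new;
}

static void charge(struct counter_sketch *counter, long nr_pages)
{
	struct counter_sketch *c;

	for (c = counter; c; c = c->parent) {
		long new = (c->usage += nr_pages);

		/* The bug was passing 'counter' here instead of 'c'. */
		note_usage(c, new);
	}
}

int main(void)
{
	struct counter_sketch root = { 0 }, child = { .parent = &root };

	charge(&child, 4);
	printf("root watermark %ld, child watermark %ld\n",
	       root.watermark, child.watermark);	/* 4 and 4 */
	return 0;
}
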
diff --git a/mm/page_io.c b/mm/page_io.c
index ccda76790088..9e362567d454 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -441,7 +441,7 @@ int swap_readpage(struct page *page, bool synchronous)
break;
if (!blk_poll(disk->queue, qc, true))
- io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
new file mode 100644
index 000000000000..1dcc865029a2
--- /dev/null
+++ b/mm/pgalloc-track.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLLC_TRACK_H
+#define _LINUX_PGALLLC_TRACK_H
+
+#if defined(CONFIG_MMU)
+static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(pgd_none(*pgd))) {
+ if (__p4d_alloc(mm, pgd, address))
+ return NULL;
+ *mod_mask |= PGTBL_PGD_MODIFIED;
+ }
+
+ return p4d_offset(pgd, address);
+}
+
+static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(p4d_none(*p4d))) {
+ if (__pud_alloc(mm, p4d, address))
+ return NULL;
+ *mod_mask |= PGTBL_P4D_MODIFIED;
+ }
+
+ return pud_offset(p4d, address);
+}
+
+static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(pud_none(*pud))) {
+ if (__pmd_alloc(mm, pud, address))
+ return NULL;
+ *mod_mask |= PGTBL_PUD_MODIFIED;
+ }
+
+ return pmd_offset(pud, address);
+}
+#endif /* CONFIG_MMU */
+
+#define pte_alloc_kernel_track(pmd, address, mask) \
+ ((unlikely(pmd_none(*(pmd))) && \
+ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
+ NULL: pte_offset_kernel(pmd, address))
+
+#endif /* _LINUX_PGALLLC_TRACK_H */
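
A user-space model of the pattern the new header provides: each *_alloc_track() helper records in a caller-supplied mask which page-table level it had to populate, so the caller can decide afterwards whether a single sync pass is needed. Types and names below are stand-ins, not the kernel helpers:

#include <stdio.h>
#include <stdlib.h>

#define MOD_PGD	0x1
#define MOD_P4D	0x2

typedef unsigned int mod_mask_t;

struct level { void *table; };

static void *alloc_track(struct level *lvl, mod_mask_t *mask, mod_mask_t bit)
{
	if (!lvl->table) {			/* analogue of pgd_none() */
		lvl->table = calloc(1, 512 * sizeof(void *));
		if (!lvl->table)
			return NULL;
		*mask |= bit;			/* remember we populated it */
	}
	return lvl->table;
}

int main(void)
{
	struct level pgd = { 0 }, p4d = { 0 };
	mod_mask_t mask = 0;

	alloc_track(&pgd, &mask, MOD_PGD);
	alloc_track(&p4d, &mask, MOD_P4D);
	alloc_track(&pgd, &mask, MOD_PGD);	/* already populated: no bit */

	if (mask)				/* arch_sync_kernel_mappings() analogue */
		printf("sync needed, mask %#x\n", mask);

	free(pgd.table);
	free(p4d.table);
	return 0;
}
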
diff --git a/mm/shmem.c b/mm/shmem.c
index b2abca3f7f33..eb6b36d89722 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -114,11 +114,13 @@ struct shmem_options {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ bool full_inums;
int huge;
int seen;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
};
#ifdef CONFIG_TMPFS
@@ -260,18 +262,76 @@ bool vma_is_shmem(struct vm_area_struct *vma)
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo->max_inodes) {
+ ino_t ino;
+
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
sbinfo->free_inodes--;
+ if (inop) {
+ ino = sbinfo->next_ino++;
+ if (unlikely(is_zero_ino(ino)))
+ ino = sbinfo->next_ino++;
+ if (unlikely(!sbinfo->full_inums &&
+ ino > UINT_MAX)) {
+ /*
+ * Emulate get_next_ino uint wraparound for
+ * compatibility
+ */
+ if (IS_ENABLED(CONFIG_64BIT))
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+ __func__, MINOR(sb->s_dev));
+ sbinfo->next_ino = 1;
+ ino = sbinfo->next_ino++;
+ }
+ *inop = ino;
+ }
spin_unlock(&sbinfo->stat_lock);
+ } else if (inop) {
+ /*
+ * __shmem_file_setup, one of our callers, is lock-free: it
+ * doesn't hold stat_lock in shmem_reserve_inode since
+ * max_inodes is always 0, and is called from potentially
+ * unknown contexts. As such, use a per-cpu batched allocator
+ * which doesn't require the per-sb stat_lock unless we are at
+ * the batch boundary.
+ *
+ * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+ * shmem mounts are not exposed to userspace, so we don't need
+ * to worry about things like glibc compatibility.
+ */
+ ino_t *next_ino;
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+ ino = *next_ino;
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+ spin_lock(&sbinfo->stat_lock);
+ ino = sbinfo->next_ino;
+ sbinfo->next_ino += SHMEM_INO_BATCH;
+ spin_unlock(&sbinfo->stat_lock);
+ if (unlikely(is_zero_ino(ino)))
+ ino++;
+ }
+ *inop = ino;
+ *next_ino = ++ino;
+ put_cpu();
}
+
return 0;
}
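
For the SB_KERNMOUNT branch above, a user-space model of the batched allocator: each CPU hands out inode numbers from a private batch and only touches the shared cursor once per SHMEM_INO_BATCH allocations. Locking and the zero-inode skip are omitted; the names are stand-ins:

#include <stdio.h>

#define INO_BATCH	1024
#define NR_CPUS		4

static unsigned long next_ino = 1;		/* shared, lock-protected in reality */
static unsigned long percpu_next[NR_CPUS];	/* per-cpu cursor */

static unsigned long ino_alloc(int cpu)
{
	unsigned long ino = percpu_next[cpu];

	if (ino % INO_BATCH == 0) {		/* batch exhausted (or first use) */
		ino = next_ino;			/* take a whole batch at once */
		next_ino += INO_BATCH;
	}
	percpu_next[cpu] = ino + 1;
	return ino;
}

int main(void)
{
	unsigned long a, b, c;

	a = ino_alloc(0);
	b = ino_alloc(0);
	c = ino_alloc(1);
	printf("%lu %lu %lu\n", a, b, c);	/* 1 2 1025 */
	return 0;
}
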
@@ -2222,13 +2282,14 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ ino_t ino;
- if (shmem_reserve_inode(sb))
+ if (shmem_reserve_inode(sb, &ino))
return NULL;
inode = new_inode(sb);
if (inode) {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2932,7 +2993,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
* first link must skip that, to get the accounting right.
*/
if (inode->i_nlink) {
- ret = shmem_reserve_inode(inode->i_sb);
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
if (ret)
goto out;
}
@@ -3347,6 +3408,8 @@ enum shmem_param {
Opt_nr_inodes,
Opt_size,
Opt_uid,
+ Opt_inode32,
+ Opt_inode64,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3366,6 +3429,8 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("size", Opt_size),
fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("inode32", Opt_inode32),
+ fsparam_flag ("inode64", Opt_inode64),
{}
};
@@ -3437,6 +3502,18 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
}
goto unsupported_parameter;
+ case Opt_inode32:
+ ctx->full_inums = false;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ case Opt_inode64:
+ if (sizeof(ino_t) < 8) {
+ return invalfc(fc,
+ "Cannot use inode64 with <64bit inums in kernel\n");
+ }
+ ctx->full_inums = true;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
}
return 0;
@@ -3528,8 +3605,16 @@ static int shmem_reconfigure(struct fs_context *fc)
}
}
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
+ sbinfo->next_ino > UINT_MAX) {
+ err = "Current inum too high to switch to 32-bit inums";
+ goto out;
+ }
+
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_INUMS)
+ sbinfo->full_inums = ctx->full_inums;
if (ctx->seen & SHMEM_SEEN_BLOCKS)
sbinfo->max_blocks = ctx->blocks;
if (ctx->seen & SHMEM_SEEN_INODES) {
@@ -3569,6 +3654,29 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
+
+ /*
+ * Showing inode{64,32} might be useful even if it's the system default,
+ * since then people don't have to resort to checking both here and
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
+ *
+ * We hide it when inode64 isn't the default and we are using 32-bit
+ * inodes, since that probably just means the feature isn't even under
+ * consideration.
+ *
+ * As such:
+ *
+ * +-----------------+-----------------+
+ * | TMPFS_INODE64=y | TMPFS_INODE64=n |
+ * +------------------+-----------------+-----------------+
+ * | full_inums=true | show | show |
+ * | full_inums=false | show | hide |
+ * +------------------+-----------------+-----------------+
+ *
+ */
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
@@ -3584,6 +3692,7 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
kfree(sbinfo);
@@ -3616,6 +3725,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
ctx->blocks = shmem_default_max_blocks();
if (!(ctx->seen & SHMEM_SEEN_INODES))
ctx->inodes = shmem_default_max_inodes();
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
} else {
sb->s_flags |= SB_NOUSER;
}
@@ -3626,8 +3737,14 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
sbinfo->max_blocks = ctx->blocks;
sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ if (sb->s_flags & SB_KERNMOUNT) {
+ sbinfo->ino_batch = alloc_percpu(ino_t);
+ if (!sbinfo->ino_batch)
+ goto failed;
+ }
sbinfo->uid = ctx->uid;
sbinfo->gid = ctx->gid;
+ sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
sbinfo->huge = ctx->huge;
sbinfo->mpol = ctx->mpol;
@@ -4128,7 +4245,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
/**
* shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ * @vma: the vma to be mmapped is prepared by do_mmap
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 44406d9977c7..9b5cd4b004b0 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -10,33 +10,11 @@
#include "shuffle.h"
DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-static unsigned long shuffle_state __ro_after_init;
-
-/*
- * Depending on the architecture, module parameter parsing may run
- * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
- * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
- * attempts to turn on the implementation, but aborts if it finds
- * SHUFFLE_FORCE_DISABLE already set.
- */
-__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
- if (ctl == SHUFFLE_FORCE_DISABLE)
- set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
-
- if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
- if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
- static_branch_disable(&page_alloc_shuffle_key);
- } else if (ctl == SHUFFLE_ENABLE
- && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
- static_branch_enable(&page_alloc_shuffle_key);
-}
static bool shuffle_param;
static int shuffle_show(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
- ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
}
static __meminit int shuffle_store(const char *val,
@@ -47,9 +25,7 @@ static __meminit int shuffle_store(const char *val,
if (rc < 0)
return rc;
if (shuffle_param)
- page_alloc_shuffle(SHUFFLE_ENABLE);
- else
- page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+ static_branch_enable(&page_alloc_shuffle_key);
return 0;
}
module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
@@ -58,25 +34,25 @@ module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
* For two pages to be swapped in the shuffle, they must be free (on a
* 'free_area' lru), have the same order, and have the same migratetype.
*/
-static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+static struct page * __meminit shuffle_valid_page(struct zone *zone,
+ unsigned long pfn, int order)
{
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
/*
* Given we're dealing with randomly selected pfns in a zone we
* need to ask questions like...
*/
- /* ...is the pfn even in the memmap? */
- if (!pfn_valid_within(pfn))
+ /* ... is the page managed by the buddy? */
+ if (!page)
return NULL;
- /* ...is the pfn in a present section or a hole? */
- if (!pfn_in_present_section(pfn))
+ /* ... is the page assigned to the same zone? */
+ if (page_zone(page) != zone)
return NULL;
/* ...is the page free and currently on a free_area list? */
- page = pfn_to_page(pfn);
if (!PageBuddy(page))
return NULL;
@@ -123,7 +99,7 @@ void __meminit __shuffle_zone(struct zone *z)
* page_j randomly selected in the span @zone_start_pfn to
* @spanned_pages.
*/
- page_i = shuffle_valid_page(i, order);
+ page_i = shuffle_valid_page(z, i, order);
if (!page_i)
continue;
@@ -137,7 +113,7 @@ void __meminit __shuffle_zone(struct zone *z)
j = z->zone_start_pfn +
ALIGN_DOWN(get_random_long() % z->spanned_pages,
order_pages);
- page_j = shuffle_valid_page(j, order);
+ page_j = shuffle_valid_page(z, j, order);
if (page_j && page_j != page_i)
break;
}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 4d79f03b6658..71b784f0b7c3 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,23 +4,10 @@
#define _MM_SHUFFLE_H
#include <linux/jump_label.h>
-/*
- * SHUFFLE_ENABLE is called from the command line enabling path, or by
- * platform-firmware enabling that indicates the presence of a
- * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
- * the command line path and overrides any previous or future
- * SHUFFLE_ENABLE.
- */
-enum mm_shuffle_ctl {
- SHUFFLE_ENABLE,
- SHUFFLE_FORCE_DISABLE,
-};
-
#define SHUFFLE_ORDER (MAX_ORDER-1)
#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
extern void __shuffle_free_memory(pg_data_t *pgdat);
extern bool shuffle_pick_tail(void);
static inline void shuffle_free_memory(pg_data_t *pgdat)
@@ -58,10 +45,6 @@ static inline void shuffle_zone(struct zone *z)
{
}
-static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
-}
-
static inline bool is_shuffle_order(int order)
{
return false;
diff --git a/mm/slab.c b/mm/slab.c
index 9350062ffc1a..3160dff6fd76 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -588,6 +588,16 @@ static int transfer_objects(struct array_cache *to,
return nr;
}
+/* &alien->lock must be held by alien callers. */
+static __always_inline void __free_one(struct array_cache *ac, void *objp)
+{
+ /* Avoid trivial double-free. */
+ if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
+ return;
+ ac->entry[ac->avail++] = objp;
+}
+
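
A user-space model of the __free_one() check introduced above: before pushing a freed object onto the array cache, compare it with the most recently pushed entry, which catches the cheapest double-free pattern (freeing the same object twice in a row). Structures below are stand-ins:

#include <stdio.h>

#define AC_SIZE 8

struct ac_sketch {
	unsigned int avail;
	void *entry[AC_SIZE];
};

static int free_one(struct ac_sketch *ac, void *objp)
{
	if (ac->avail > 0 && ac->entry[ac->avail - 1] == objp) {
		fprintf(stderr, "double free of %p detected\n", objp);
		return -1;			/* WARN_ON_ONCE() analogue */
	}
	ac->entry[ac->avail++] = objp;
	return 0;
}

int main(void)
{
	struct ac_sketch ac = { 0 };
	int obj;

	free_one(&ac, &obj);
	free_one(&ac, &obj);	/* caught: same object as the last free */
	return 0;
}
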
#ifndef CONFIG_NUMA
#define drain_alien_cache(cachep, alien) do { } while (0)
@@ -767,7 +777,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac->entry[ac->avail++] = objp;
+ __free_one(ac, objp);
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
@@ -1050,7 +1060,7 @@ int slab_prepare_cpu(unsigned int cpu)
* offline.
*
* Even if all the cpus of a node are down, we don't free the
- * kmem_list3 of any cache. This to avoid a race between cpu_down, and
+ * kmem_cache_node of any cache. This is to avoid a race between cpu_down and
* a kmalloc allocation from another cpu for memory from the node of
* the cpu going down. The list3 structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
@@ -1239,7 +1249,6 @@ void __init kmem_cache_init(void)
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
- memcg_link_cache(kmem_cache, NULL);
slab_state = PARTIAL;
/*
@@ -1370,11 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- if (charge_slab_page(page, flags, cachep->gfporder, cachep)) {
- __free_pages(page, cachep->gfporder);
- return NULL;
- }
-
+ account_slab_page(page, cachep->gfporder, cachep);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1398,7 +1403,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- uncharge_slab_page(page, order, cachep);
+ unaccount_slab_page(page, order, cachep);
__free_pages(page, order);
}
@@ -2243,17 +2248,6 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
-{
- __kmem_cache_shrink(cachep);
-}
-
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
-}
-#endif
-
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
return __kmem_cache_shrink(cachep);
@@ -2579,13 +2573,9 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
- flags &= ~GFP_SLAB_BUG_MASK;
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
- invalid_mask, &invalid_mask, flags, &flags);
- dump_stack();
- }
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
@@ -3222,9 +3212,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
unsigned long save_flags;
void *ptr;
int slab_node = numa_mem_id();
+ struct obj_cgroup *objcg = NULL;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3260,7 +3251,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
memset(ptr, 0, cachep->object_size);
- slab_post_alloc_hook(cachep, flags, 1, &ptr);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
return ptr;
}
@@ -3301,9 +3292,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
+ struct obj_cgroup *objcg = NULL;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3317,7 +3309,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);
- slab_post_alloc_hook(cachep, flags, 1, &objp);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
return objp;
}
@@ -3426,6 +3418,11 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (kasan_slab_free(cachep, objp, _RET_IP_))
return;
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(objp, cachep->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
___cache_free(cachep, objp, caller);
}
@@ -3439,6 +3436,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
+ memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp);
/*
* Skip calling cache_free_alien() when the platform is not numa.
@@ -3466,7 +3464,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
}
}
- ac->entry[ac->avail++] = objp;
+ __free_one(ac, objp);
}
/**
@@ -3504,8 +3502,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
size_t i;
+ struct obj_cgroup *objcg = NULL;
- s = slab_pre_alloc_hook(s, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (!s)
return 0;
@@ -3528,13 +3527,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
for (i = 0; i < size; i++)
memset(p[i], 0, s->object_size);
- slab_post_alloc_hook(s, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3796,8 +3795,8 @@ fail:
}
/* Always called with the slab_mutex held */
-static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared, gfp_t gfp)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
{
struct array_cache __percpu *cpu_cache, *prev;
int cpu;
@@ -3842,29 +3841,6 @@ setup_node:
return setup_kmem_cache_nodes(cachep, gfp);
}
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared, gfp_t gfp)
-{
- int ret;
- struct kmem_cache *c;
-
- ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
-
- if (slab_state < FULL)
- return ret;
-
- if ((ret < 0) || !is_root_cache(cachep))
- return ret;
-
- lockdep_assert_held(&slab_mutex);
- for_each_memcg_cache(c, cachep) {
- /* return value determined by the root cache only */
- __do_tune_cpucache(c, limit, batchcount, shared, gfp);
- }
-
- return ret;
-}
-
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
@@ -3877,13 +3853,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (err)
goto end;
- if (!is_root_cache(cachep)) {
- struct kmem_cache *root = memcg_root_cache(cachep);
- limit = root->limit;
- shared = root->shared;
- batchcount = root->batchcount;
- }
-
if (limit && shared && batchcount)
goto skip_setup;
/*
diff --git a/mm/slab.h b/mm/slab.h
index 74f7e09a7cfd..6cc323f1313a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,69 +30,6 @@ struct kmem_cache {
struct list_head list; /* List of all slab caches on the system */
};
-#else /* !CONFIG_SLOB */
-
-struct memcg_cache_array {
- struct rcu_head rcu;
- struct kmem_cache *entries[0];
-};
-
-/*
- * This is the main placeholder for memcg-related information in kmem caches.
- * Both the root cache and the child caches will have it. For the root cache,
- * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system. To allow the
- * array to be accessed without taking any locks, on relocation we free the old
- * version only after a grace period.
- *
- * Root and child caches hold different metadata.
- *
- * @root_cache: Common to root and child caches. NULL for root, pointer to
- * the root cache for children.
- *
- * The following fields are specific to root caches.
- *
- * @memcg_caches: kmemcg ID indexed table of child caches. This table is
- * used to index child cachces during allocation and cleared
- * early during shutdown.
- *
- * @root_caches_node: List node for slab_root_caches list.
- *
- * @children: List of all child caches. While the child caches are also
- * reachable through @memcg_caches, a child cache remains on
- * this list until it is actually destroyed.
- *
- * The following fields are specific to child caches.
- *
- * @memcg: Pointer to the memcg this cache belongs to.
- *
- * @children_node: List node for @root_cache->children list.
- *
- * @kmem_caches_node: List node for @memcg->kmem_caches list.
- */
-struct memcg_cache_params {
- struct kmem_cache *root_cache;
- union {
- struct {
- struct memcg_cache_array __rcu *memcg_caches;
- struct list_head __root_caches_node;
- struct list_head children;
- bool dying;
- };
- struct {
- struct mem_cgroup *memcg;
- struct list_head children_node;
- struct list_head kmem_caches_node;
- struct percpu_ref refcnt;
-
- void (*work_fn)(struct kmem_cache *);
- union {
- struct rcu_head rcu_head;
- struct work_struct work;
- };
- };
- };
-};
#endif /* CONFIG_SLOB */
#ifdef CONFIG_SLAB
@@ -109,6 +46,7 @@ struct memcg_cache_params {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -152,6 +90,7 @@ void create_kmalloc_caches(slab_flags_t);
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
#endif
+gfp_t kmalloc_fix_flags(gfp_t flags);
/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
@@ -234,10 +173,7 @@ bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
-void __kmemcg_cache_deactivate(struct kmem_cache *s);
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
-void kmem_cache_shrink_all(struct kmem_cache *s);
struct seq_file;
struct file;
@@ -272,199 +208,208 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
static inline int cache_vmstat_idx(struct kmem_cache *s)
{
return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE;
+ NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
}
-#ifdef CONFIG_MEMCG_KMEM
-
-/* List of all root caches. */
-extern struct list_head slab_root_caches;
-#define root_caches_node memcg_params.__root_caches_node
+#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG_ON
+DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
+#else
+DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
+#endif
+extern void print_tracking(struct kmem_cache *s, void *object);
+#else
+static inline void print_tracking(struct kmem_cache *s, void *object)
+{
+}
+#endif
/*
- * Iterate over all memcg caches of the given root cache. The caller must hold
- * slab_mutex.
+ * Returns true if any of the specified slub_debug flags is enabled for the
+ * cache. Use only for flags parsed by setup_slub_debug() as it also enables
+ * the static key.
*/
-#define for_each_memcg_cache(iter, root) \
- list_for_each_entry(iter, &(root)->memcg_params.children, \
- memcg_params.children_node)
-
-static inline bool is_root_cache(struct kmem_cache *s)
+static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
{
- return !s->memcg_params.root_cache;
+#ifdef CONFIG_SLUB_DEBUG
+ VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+ if (static_branch_unlikely(&slub_debug_enabled))
+ return s->flags & flags;
+#endif
+ return false;
}
-static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
+#ifdef CONFIG_MEMCG_KMEM
+static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
{
- return p == s || p == s->memcg_params.root_cache;
+ /*
+ * page->mem_cgroup and page->obj_cgroups are sharing the same
+ * space. To distinguish between them in case we don't know for sure
+ * that the page is a slab page (e.g. page_cgroup_ino()), let's
+ * always set the lowest bit of obj_cgroups.
+ */
+ return (struct obj_cgroup **)
+ ((unsigned long)page->obj_cgroups & ~0x1UL);
}
-/*
- * We use suffixes to the name in memcg because we can't have caches
- * created in the system with the same name. But when we print them
- * locally, better refer to them with the base name
- */
-static inline const char *cache_name(struct kmem_cache *s)
+static inline bool page_has_obj_cgroups(struct page *page)
{
- if (!is_root_cache(s))
- s = s->memcg_params.root_cache;
- return s->name;
+ return ((unsigned long)page->obj_cgroups & 0x1UL);
}
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp);
+
+static inline void memcg_free_page_obj_cgroups(struct page *page)
{
- if (is_root_cache(s))
- return s;
- return s->memcg_params.root_cache;
+ kfree(page_obj_cgroups(page));
+ page->obj_cgroups = NULL;
}
-/*
- * Expects a pointer to a slab page. Please note, that PageSlab() check
- * isn't sufficient, as it returns true also for tail compound slab pages,
- * which do not have slab_cache pointer set.
- * So this function assumes that the page can pass PageSlab() && !PageTail()
- * check.
- *
- * The kmem_cache can be reparented asynchronously. The caller must ensure
- * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
- */
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline size_t obj_full_size(struct kmem_cache *s)
{
- struct kmem_cache *s;
-
- s = READ_ONCE(page->slab_cache);
- if (s && !is_root_cache(s))
- return READ_ONCE(s->memcg_params.memcg);
-
- return NULL;
+ /*
+ * For each accounted object there is an extra space which is used
+ * to store obj_cgroup membership. Charge it too.
+ */
+ return s->size + sizeof(struct obj_cgroup *);
}
-/*
- * Charge the slab page belonging to the non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline int memcg_charge_slab(struct page *page,
- gfp_t gfp, int order,
- struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ size_t objects,
+ gfp_t flags)
{
- int nr_pages = 1 << order;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
- int ret;
-
- rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- while (memcg && !css_tryget_online(&memcg->css))
- memcg = parent_mem_cgroup(memcg);
- rcu_read_unlock();
+ struct obj_cgroup *objcg;
- if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- nr_pages);
- percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
- return 0;
- }
+ if (memcg_kmem_bypass())
+ return NULL;
- ret = memcg_kmem_charge(memcg, gfp, nr_pages);
- if (ret)
- goto out;
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return NULL;
- lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages);
+ if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
+ obj_cgroup_put(objcg);
+ return NULL;
+ }
- /* transer try_charge() page references to kmem_cache */
- percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
- css_put_many(&memcg->css, nr_pages);
-out:
- css_put(&memcg->css);
- return ret;
+ return objcg;
}
-/*
- * Uncharge a slab page belonging to a non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
+static inline void mod_objcg_state(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ int idx, int nr)
{
- int nr_pages = 1 << order;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- if (likely(!mem_cgroup_is_root(memcg))) {
- lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages);
- memcg_kmem_uncharge(memcg, nr_pages);
- } else {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -nr_pages);
- }
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
+}
+
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
+{
+ struct page *page;
+ unsigned long off;
+ size_t i;
+
+ if (!objcg)
+ return;
- percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages);
+ flags &= ~__GFP_ACCOUNT;
+ for (i = 0; i < size; i++) {
+ if (likely(p[i])) {
+ page = virt_to_head_page(p[i]);
+
+ if (!page_has_obj_cgroups(page) &&
+ memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ continue;
+ }
+
+ off = obj_to_index(s, page, p[i]);
+ obj_cgroup_get(objcg);
+ page_obj_cgroups(page)[off] = objcg;
+ mod_objcg_state(objcg, page_pgdat(page),
+ cache_vmstat_idx(s), obj_full_size(s));
+ } else {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ }
+ }
+ obj_cgroup_put(objcg);
}
-extern void slab_init_memcg_params(struct kmem_cache *);
-extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+ void *p)
+{
+ struct obj_cgroup *objcg;
+ unsigned int off;
-#else /* CONFIG_MEMCG_KMEM */
+ if (!memcg_kmem_enabled())
+ return;
-/* If !memcg, all caches are root. */
-#define slab_root_caches slab_caches
-#define root_caches_node list
+ if (!page_has_obj_cgroups(page))
+ return;
-#define for_each_memcg_cache(iter, root) \
- for ((void)(iter), (void)(root); 0; )
+ off = obj_to_index(s, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ page_obj_cgroups(page)[off] = NULL;
-static inline bool is_root_cache(struct kmem_cache *s)
-{
- return true;
-}
+ if (!objcg)
+ return;
-static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
-{
- return s == p;
-}
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ -obj_full_size(s));
-static inline const char *cache_name(struct kmem_cache *s)
-{
- return s->name;
+ obj_cgroup_put(objcg);
}
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+#else /* CONFIG_MEMCG_KMEM */
+static inline bool page_has_obj_cgroups(struct page *page)
{
- return s;
+ return false;
}
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;
}
-static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
- struct kmem_cache *s)
+static inline int memcg_alloc_page_obj_cgroups(struct page *page,
+ struct kmem_cache *s, gfp_t gfp)
{
return 0;
}
-static inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
+static inline void memcg_free_page_obj_cgroups(struct page *page)
{
}
-static inline void slab_init_memcg_params(struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ size_t objects,
+ gfp_t flags)
{
+ return NULL;
}
-static inline void memcg_link_cache(struct kmem_cache *s,
- struct mem_cgroup *memcg)
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
{
}
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+ void *p)
+{
+}
#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *virt_to_cache(const void *obj)
@@ -478,51 +423,36 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
return page->slab_cache;
}
-static __always_inline int charge_slab_page(struct page *page,
- gfp_t gfp, int order,
- struct kmem_cache *s)
+static __always_inline void account_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
{
- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- 1 << order);
- return 0;
- }
-
- return memcg_charge_slab(page, gfp, order, s);
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ PAGE_SIZE << order);
}
-static __always_inline void uncharge_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+static __always_inline void unaccount_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
{
- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -(1 << order));
- return;
- }
+ if (memcg_kmem_enabled())
+ memcg_free_page_obj_cgroups(page);
- memcg_uncharge_slab(page, order, s);
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(PAGE_SIZE << order));
}
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
struct kmem_cache *cachep;
- /*
- * When kmemcg is not being used, both assignments should return the
- * same value. but we don't want to pay the assignment price in that
- * case. If it is not compiled in, the compiler should be smart enough
- * to not do even the assignment. In that case, slab_equal_or_root
- * will also be a constant.
- */
- if (!memcg_kmem_enabled() &&
- !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
- !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
+ if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
return s;
cachep = virt_to_cache(x);
- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s),
+ if (WARN(cachep && cachep != s,
"%s: Wrong slab cache. %s but object is from %s\n",
- __func__, s->name, cachep->name);
+ __func__, s->name, cachep->name))
+ print_tracking(cachep, x);
return cachep;
}
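
A user-space model of the page->obj_cgroups low-bit tagging used in the slab.h hunks above: the same word can hold either a mem_cgroup pointer or an obj_cgroup vector, and the vector case is marked by setting bit 0, which is otherwise always clear for an aligned allocation:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static void *tag_vector(void *vec)
{
	return (void *)((uintptr_t)vec | 0x1UL);	/* mark as obj_cgroup vector */
}

static int is_vector(void *word)
{
	return (uintptr_t)word & 0x1UL;
}

static void *untag_vector(void *word)
{
	return (void *)((uintptr_t)word & ~0x1UL);	/* recover the real pointer */
}

int main(void)
{
	void **vec = calloc(4, sizeof(*vec));	/* obj_cgroup pointer array */
	void *word = tag_vector(vec);		/* what would live in struct page */

	assert(is_vector(word));
	assert(untag_vector(word) == (void *)vec);
	free(vec);
	return 0;
}
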
@@ -557,7 +487,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
}
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
+ struct obj_cgroup **objcgp,
+ size_t size, gfp_t flags)
{
flags &= gfp_allowed_mask;
@@ -571,13 +502,14 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
if (memcg_kmem_enabled() &&
((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
- return memcg_kmem_get_cache(s);
+ *objcgp = memcg_slab_pre_alloc_hook(s, size, flags);
return s;
}
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
+static inline void slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size, void **p)
{
size_t i;
@@ -590,7 +522,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
}
if (memcg_kmem_enabled())
- memcg_kmem_put_cache(s);
+ memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
}
#ifndef CONFIG_SLOB
@@ -645,9 +577,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
-void *memcg_slab_start(struct seq_file *m, loff_t *pos);
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
-void memcg_slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fe8b68482670..a513f3237155 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,6 +26,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
+#include "internal.h"
+
#include "slab.h"
enum slab_state slab_state;
@@ -128,152 +130,6 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#ifdef CONFIG_MEMCG_KMEM
-
-LIST_HEAD(slab_root_caches);
-static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
-
-void slab_init_memcg_params(struct kmem_cache *s)
-{
- s->memcg_params.root_cache = NULL;
- RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
- INIT_LIST_HEAD(&s->memcg_params.children);
- s->memcg_params.dying = false;
-}
-
-static int init_memcg_params(struct kmem_cache *s,
- struct kmem_cache *root_cache)
-{
- struct memcg_cache_array *arr;
-
- if (root_cache) {
- int ret = percpu_ref_init(&s->memcg_params.refcnt,
- kmemcg_cache_shutdown,
- 0, GFP_KERNEL);
- if (ret)
- return ret;
-
- s->memcg_params.root_cache = root_cache;
- INIT_LIST_HEAD(&s->memcg_params.children_node);
- INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
- return 0;
- }
-
- slab_init_memcg_params(s);
-
- if (!memcg_nr_cache_ids)
- return 0;
-
- arr = kvzalloc(sizeof(struct memcg_cache_array) +
- memcg_nr_cache_ids * sizeof(void *),
- GFP_KERNEL);
- if (!arr)
- return -ENOMEM;
-
- RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
- return 0;
-}
-
-static void destroy_memcg_params(struct kmem_cache *s)
-{
- if (is_root_cache(s)) {
- kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
- } else {
- mem_cgroup_put(s->memcg_params.memcg);
- WRITE_ONCE(s->memcg_params.memcg, NULL);
- percpu_ref_exit(&s->memcg_params.refcnt);
- }
-}
-
-static void free_memcg_params(struct rcu_head *rcu)
-{
- struct memcg_cache_array *old;
-
- old = container_of(rcu, struct memcg_cache_array, rcu);
- kvfree(old);
-}
-
-static int update_memcg_params(struct kmem_cache *s, int new_array_size)
-{
- struct memcg_cache_array *old, *new;
-
- new = kvzalloc(sizeof(struct memcg_cache_array) +
- new_array_size * sizeof(void *), GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- old = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- if (old)
- memcpy(new->entries, old->entries,
- memcg_nr_cache_ids * sizeof(void *));
-
- rcu_assign_pointer(s->memcg_params.memcg_caches, new);
- if (old)
- call_rcu(&old->rcu, free_memcg_params);
- return 0;
-}
-
-int memcg_update_all_caches(int num_memcgs)
-{
- struct kmem_cache *s;
- int ret = 0;
-
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- ret = update_memcg_params(s, num_memcgs);
- /*
- * Instead of freeing the memory, we'll just leave the caches
- * up to this point in an updated state.
- */
- if (ret)
- break;
- }
- mutex_unlock(&slab_mutex);
- return ret;
-}
-
-void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
-{
- if (is_root_cache(s)) {
- list_add(&s->root_caches_node, &slab_root_caches);
- } else {
- css_get(&memcg->css);
- s->memcg_params.memcg = memcg;
- list_add(&s->memcg_params.children_node,
- &s->memcg_params.root_cache->memcg_params.children);
- list_add(&s->memcg_params.kmem_caches_node,
- &s->memcg_params.memcg->kmem_caches);
- }
-}
-
-static void memcg_unlink_cache(struct kmem_cache *s)
-{
- if (is_root_cache(s)) {
- list_del(&s->root_caches_node);
- } else {
- list_del(&s->memcg_params.children_node);
- list_del(&s->memcg_params.kmem_caches_node);
- }
-}
-#else
-static inline int init_memcg_params(struct kmem_cache *s,
- struct kmem_cache *root_cache)
-{
- return 0;
-}
-
-static inline void destroy_memcg_params(struct kmem_cache *s)
-{
-}
-
-static inline void memcg_unlink_cache(struct kmem_cache *s)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
/*
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
@@ -311,9 +167,6 @@ int slab_unmergeable(struct kmem_cache *s)
if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
return 1;
- if (!is_root_cache(s))
- return 1;
-
if (s->ctor)
return 1;
@@ -326,14 +179,6 @@ int slab_unmergeable(struct kmem_cache *s)
if (s->refcount < 0)
return 1;
-#ifdef CONFIG_MEMCG_KMEM
- /*
- * Skip the dying kmem_cache.
- */
- if (s->memcg_params.dying)
- return 1;
-#endif
-
return 0;
}
@@ -356,7 +201,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
if (flags & SLAB_NEVER_MERGE)
return NULL;
- list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
+ list_for_each_entry_reverse(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
@@ -388,7 +233,7 @@ static struct kmem_cache *create_cache(const char *name,
unsigned int object_size, unsigned int align,
slab_flags_t flags, unsigned int useroffset,
unsigned int usersize, void (*ctor)(void *),
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+ struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
@@ -408,24 +253,18 @@ static struct kmem_cache *create_cache(const char *name,
s->useroffset = useroffset;
s->usersize = usersize;
- err = init_memcg_params(s, root_cache);
- if (err)
- goto out_free_cache;
-
err = __kmem_cache_create(s, flags);
if (err)
goto out_free_cache;
s->refcount = 1;
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, memcg);
out:
if (err)
return ERR_PTR(err);
return s;
out_free_cache:
- destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
@@ -471,7 +310,6 @@ kmem_cache_create_usercopy(const char *name,
get_online_cpus();
get_online_mems();
- memcg_get_cache_ids();
mutex_lock(&slab_mutex);
@@ -512,7 +350,7 @@ kmem_cache_create_usercopy(const char *name,
s = create_cache(cache_name, size,
calculate_alignment(flags, align, size),
- flags, useroffset, usersize, ctor, NULL, NULL);
+ flags, useroffset, usersize, ctor, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
@@ -521,7 +359,6 @@ kmem_cache_create_usercopy(const char *name,
out_unlock:
mutex_unlock(&slab_mutex);
- memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
@@ -614,7 +451,6 @@ static int shutdown_cache(struct kmem_cache *s)
if (__kmem_cache_shutdown(s) != 0)
return -EBUSY;
- memcg_unlink_cache(s);
list_del(&s->list);
if (s->flags & SLAB_TYPESAFE_BY_RCU) {
@@ -635,311 +471,9 @@ static int shutdown_cache(struct kmem_cache *s)
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
-/*
- * memcg_create_kmem_cache - Create a cache for a memory cgroup.
- * @memcg: The memory cgroup the new cache is for.
- * @root_cache: The parent of the new cache.
- *
- * This function attempts to create a kmem cache that will serve allocation
- * requests going from @memcg to @root_cache. The new cache inherits properties
- * from its parent.
- */
-void memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
-{
- static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- struct cgroup_subsys_state *css = &memcg->css;
- struct memcg_cache_array *arr;
- struct kmem_cache *s = NULL;
- char *cache_name;
- int idx;
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
-
- /*
- * The memory cgroup could have been offlined while the cache
- * creation work was pending.
- */
- if (memcg->kmem_state != KMEM_ONLINE)
- goto out_unlock;
-
- idx = memcg_cache_id(memcg);
- arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
-
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can try to
- * create the same cache, but only one of them may succeed.
- */
- if (arr->entries[idx])
- goto out_unlock;
-
- cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
- cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
- css->serial_nr, memcg_name_buf);
- if (!cache_name)
- goto out_unlock;
-
- s = create_cache(cache_name, root_cache->object_size,
- root_cache->align,
- root_cache->flags & CACHE_CREATE_MASK,
- root_cache->useroffset, root_cache->usersize,
- root_cache->ctor, memcg, root_cache);
- /*
- * If we could not create a memcg cache, do not complain, because
- * that's not critical at all as we can always proceed with the root
- * cache.
- */
- if (IS_ERR(s)) {
- kfree(cache_name);
- goto out_unlock;
- }
-
- /*
- * Since readers won't lock (see memcg_kmem_get_cache()), we need a
- * barrier here to ensure nobody will see the kmem_cache partially
- * initialized.
- */
- smp_wmb();
- arr->entries[idx] = s;
-
-out_unlock:
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static void kmemcg_workfn(struct work_struct *work)
-{
- struct kmem_cache *s = container_of(work, struct kmem_cache,
- memcg_params.work);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- s->memcg_params.work_fn(s);
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static void kmemcg_rcufn(struct rcu_head *head)
-{
- struct kmem_cache *s = container_of(head, struct kmem_cache,
- memcg_params.rcu_head);
-
- /*
- * We need to grab blocking locks. Bounce to ->work. The
- * work item shares the space with the RCU head and can't be
- * initialized earlier.
- */
- INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
- queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-}
-
-static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
-{
- WARN_ON(shutdown_cache(s));
-}
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
-{
- struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
- memcg_params.refcnt);
- unsigned long flags;
-
- spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
- if (s->memcg_params.root_cache->memcg_params.dying)
- goto unlock;
-
- s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
- INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
- queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-
-unlock:
- spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
-}
-
-static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
- __kmemcg_cache_deactivate_after_rcu(s);
- percpu_ref_kill(&s->memcg_params.refcnt);
-}
-
-static void kmemcg_cache_deactivate(struct kmem_cache *s)
-{
- if (WARN_ON_ONCE(is_root_cache(s)))
- return;
-
- __kmemcg_cache_deactivate(s);
- s->flags |= SLAB_DEACTIVATED;
-
- /*
- * memcg_kmem_wq_lock is used to synchronize memcg_params.dying
- * flag and make sure that no new kmem_cache deactivation tasks
- * are queued (see flush_memcg_workqueue() ).
- */
- spin_lock_irq(&memcg_kmem_wq_lock);
- if (s->memcg_params.root_cache->memcg_params.dying)
- goto unlock;
-
- s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
- call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
-unlock:
- spin_unlock_irq(&memcg_kmem_wq_lock);
-}
-
-void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
-{
- int idx;
- struct memcg_cache_array *arr;
- struct kmem_cache *s, *c;
- unsigned int nr_reparented;
-
- idx = memcg_cache_id(memcg);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- c = arr->entries[idx];
- if (!c)
- continue;
-
- kmemcg_cache_deactivate(c);
- arr->entries[idx] = NULL;
- }
- nr_reparented = 0;
- list_for_each_entry(s, &memcg->kmem_caches,
- memcg_params.kmem_caches_node) {
- WRITE_ONCE(s->memcg_params.memcg, parent);
- css_put(&memcg->css);
- nr_reparented++;
- }
- if (nr_reparented) {
- list_splice_init(&memcg->kmem_caches,
- &parent->kmem_caches);
- css_get_many(&parent->css, nr_reparented);
- }
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static int shutdown_memcg_caches(struct kmem_cache *s)
-{
- struct memcg_cache_array *arr;
- struct kmem_cache *c, *c2;
- LIST_HEAD(busy);
- int i;
-
- BUG_ON(!is_root_cache(s));
-
- /*
- * First, shutdown active caches, i.e. caches that belong to online
- * memory cgroups.
- */
- arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- for_each_memcg_cache_index(i) {
- c = arr->entries[i];
- if (!c)
- continue;
- if (shutdown_cache(c))
- /*
- * The cache still has objects. Move it to a temporary
- * list so as not to try to destroy it for a second
- * time while iterating over inactive caches below.
- */
- list_move(&c->memcg_params.children_node, &busy);
- else
- /*
- * The cache is empty and will be destroyed soon. Clear
- * the pointer to it in the memcg_caches array so that
- * it will never be accessed even if the root cache
- * stays alive.
- */
- arr->entries[i] = NULL;
- }
-
- /*
- * Second, shutdown all caches left from memory cgroups that are now
- * offline.
- */
- list_for_each_entry_safe(c, c2, &s->memcg_params.children,
- memcg_params.children_node)
- shutdown_cache(c);
-
- list_splice(&busy, &s->memcg_params.children);
-
- /*
- * A cache being destroyed must be empty. In particular, this means
- * that all per memcg caches attached to it must be empty too.
- */
- if (!list_empty(&s->memcg_params.children))
- return -EBUSY;
- return 0;
-}
-
-static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
-{
- spin_lock_irq(&memcg_kmem_wq_lock);
- s->memcg_params.dying = true;
- spin_unlock_irq(&memcg_kmem_wq_lock);
-}
-
-static void flush_memcg_workqueue(struct kmem_cache *s)
-{
- /*
- * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
- * sure all registered rcu callbacks have been invoked.
- */
- rcu_barrier();
-
- /*
- * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
- * deactivates the memcg kmem_caches through workqueue. Make sure all
- * previous workitems on workqueue are processed.
- */
- if (likely(memcg_kmem_cache_wq))
- flush_workqueue(memcg_kmem_cache_wq);
-
- /*
- * If we're racing with children kmem_cache deactivation, it might
- * take another rcu grace period to complete their destruction.
- * At this moment the corresponding percpu_ref_kill() call should be
- * done, but it might take another rcu grace period to complete
- * switching to the atomic mode.
- * Please, note that we check without grabbing the slab_mutex. It's safe
- * because at this moment the children list can't grow.
- */
- if (!list_empty(&s->memcg_params.children))
- rcu_barrier();
-}
-#else
-static inline int shutdown_memcg_caches(struct kmem_cache *s)
-{
- return 0;
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
void slab_kmem_cache_release(struct kmem_cache *s)
{
__kmem_cache_release(s);
- destroy_memcg_params(s);
kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
}
@@ -960,26 +494,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
-#ifdef CONFIG_MEMCG_KMEM
- memcg_set_kmem_cache_dying(s);
-
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-
- flush_memcg_workqueue(s);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
-#endif
-
- err = shutdown_memcg_caches(s);
- if (!err)
- err = shutdown_cache(s);
-
+ err = shutdown_cache(s);
if (err) {
pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
s->name);
@@ -1016,43 +531,6 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-/**
- * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
- * @s: The cache pointer
- */
-void kmem_cache_shrink_all(struct kmem_cache *s)
-{
- struct kmem_cache *c;
-
- if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
- kmem_cache_shrink(s);
- return;
- }
-
- get_online_cpus();
- get_online_mems();
- kasan_cache_shrink(s);
- __kmem_cache_shrink(s);
-
- /*
- * We have to take the slab_mutex to protect from the memcg list
- * modification.
- */
- mutex_lock(&slab_mutex);
- for_each_memcg_cache(c, s) {
- /*
- * Don't need to shrink deactivated memcg caches.
- */
- if (s->flags & SLAB_DEACTIVATED)
- continue;
- kasan_cache_shrink(c);
- __kmem_cache_shrink(c);
- }
- mutex_unlock(&slab_mutex);
- put_online_mems();
- put_online_cpus();
-}
-
bool slab_is_available(void)
{
return slab_state >= UP;
@@ -1081,8 +559,6 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
s->useroffset = useroffset;
s->usersize = usersize;
- slab_init_memcg_params(s);
-
err = __kmem_cache_create(s, flags);
if (err)
@@ -1103,7 +579,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
create_boot_cache(s, name, size, flags, useroffset, usersize);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, NULL);
s->refcount = 1;
return s;
}
@@ -1332,6 +807,18 @@ void __init create_kmalloc_caches(slab_flags_t flags)
}
#endif /* !CONFIG_SLOB */
+gfp_t kmalloc_fix_flags(gfp_t flags)
+{
+ gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+ flags &= ~GFP_SLAB_BUG_MASK;
+ pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ dump_stack();
+
+ return flags;
+}
+
/*
* To avoid unnecessary overhead, we pass through large allocation requests
* directly to the page allocator. We use __GFP_COMP, because we will need to
@@ -1342,12 +829,15 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
void *ret = NULL;
struct page *page;
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
if (likely(page)) {
ret = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
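A recurring change across this diff: the slab vmstat items switch to the byte-granular *_B variants, so the delta passed to mod_node_page_state() is now a byte count rather than a page count. As a quick worked example (assuming the common 4 KiB PAGE_SIZE), an order-3 allocation used to add 1 << 3 = 8 pages to NR_SLAB_UNRECLAIMABLE; it now adds PAGE_SIZE << 3 = 32768 bytes to NR_SLAB_UNRECLAIMABLE_B. Both describe the same 32 KiB of memory; the unit is simply finer, presumably so that slab memory can later be accounted at sub-page (per-object) granularity.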
@@ -1444,12 +934,12 @@ static void print_slabinfo_header(struct seq_file *m)
void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
- return seq_list_start(&slab_root_caches, *pos);
+ return seq_list_start(&slab_caches, *pos);
}
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &slab_root_caches, pos);
+ return seq_list_next(p, &slab_caches, pos);
}
void slab_stop(struct seq_file *m, void *p)
@@ -1457,27 +947,6 @@ void slab_stop(struct seq_file *m, void *p)
mutex_unlock(&slab_mutex);
}
-static void
-memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
-{
- struct kmem_cache *c;
- struct slabinfo sinfo;
-
- if (!is_root_cache(s))
- return;
-
- for_each_memcg_cache(c, s) {
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(c, &sinfo);
-
- info->active_slabs += sinfo.active_slabs;
- info->num_slabs += sinfo.num_slabs;
- info->shared_avail += sinfo.shared_avail;
- info->active_objs += sinfo.active_objs;
- info->num_objs += sinfo.num_objs;
- }
-}
-
static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
struct slabinfo sinfo;
@@ -1485,10 +954,8 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(s, &sinfo);
- memcg_accumulate_slabinfo(s, &sinfo);
-
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+ s->name, sinfo.active_objs, sinfo.num_objs, s->size,
sinfo.objects_per_slab, (1 << sinfo.cache_order));
seq_printf(m, " : tunables %4u %4u %4u",
@@ -1501,9 +968,9 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
static int slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
- if (p == slab_root_caches.next)
+ if (p == slab_caches.next)
print_slabinfo_header(m);
cache_show(s, m);
return 0;
@@ -1530,13 +997,13 @@ void dump_unreclaimable_slab(void)
pr_info("Name Used Total\n");
list_for_each_entry_safe(s, s2, &slab_caches, list) {
- if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
get_slabinfo(s, &sinfo);
if (sinfo.num_objs > 0)
- pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
+ pr_info("%-17s %10luKB %10luKB\n", s->name,
(sinfo.active_objs * s->size) / 1024,
(sinfo.num_objs * s->size) / 1024);
}
@@ -1544,35 +1011,12 @@ void dump_unreclaimable_slab(void)
}
#if defined(CONFIG_MEMCG_KMEM)
-void *memcg_slab_start(struct seq_file *m, loff_t *pos)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- mutex_lock(&slab_mutex);
- return seq_list_start(&memcg->kmem_caches, *pos);
-}
-
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- return seq_list_next(p, &memcg->kmem_caches, pos);
-}
-
-void memcg_slab_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&slab_mutex);
-}
-
int memcg_slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache,
- memcg_params.kmem_caches_node);
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- if (p == memcg->kmem_caches.next)
- print_slabinfo_header(m);
- cache_show(s, m);
+ /*
+ * Deprecated.
+ * Please take a look at tools/cgroup/slabinfo.py.
+ */
return 0;
}
#endif
@@ -1618,73 +1062,15 @@ static int __init slab_proc_init(void)
}
module_init(slab_proc_init);
-#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
-/*
- * Display information about kmem caches that have child memcg caches.
- */
-static int memcg_slabinfo_show(struct seq_file *m, void *unused)
-{
- struct kmem_cache *s, *c;
- struct slabinfo sinfo;
-
- mutex_lock(&slab_mutex);
- seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
- seq_puts(m, " <active_slabs> <num_slabs>\n");
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- /*
- * Skip kmem caches that don't have any memcg children.
- */
- if (list_empty(&s->memcg_params.children))
- continue;
-
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(s, &sinfo);
- seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n",
- cache_name(s), sinfo.active_objs, sinfo.num_objs,
- sinfo.active_slabs, sinfo.num_slabs);
-
- for_each_memcg_cache(c, s) {
- struct cgroup_subsys_state *css;
- char *status = "";
-
- css = &c->memcg_params.memcg->css;
- if (!(css->flags & CSS_ONLINE))
- status = ":dead";
- else if (c->flags & SLAB_DEACTIVATED)
- status = ":deact";
-
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(c, &sinfo);
- seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
- cache_name(c), css->id, status,
- sinfo.active_objs, sinfo.num_objs,
- sinfo.active_slabs, sinfo.num_slabs);
- }
- }
- mutex_unlock(&slab_mutex);
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
-
-static int __init memcg_slabinfo_init(void)
-{
- debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO,
- NULL, NULL, &memcg_slabinfo_fops);
- return 0;
-}
-
-late_initcall(memcg_slabinfo_init);
-#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
gfp_t flags)
{
void *ret;
- size_t ks = 0;
+ size_t ks;
- if (p)
- ks = ksize(p);
+ ks = ksize(p);
if (ks >= new_size) {
p = kasan_krealloc((void *)p, new_size, flags);
@@ -1729,28 +1115,27 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
EXPORT_SYMBOL(krealloc);
/**
- * kzfree - like kfree but zero memory
+ * kfree_sensitive - Clear sensitive information in memory before freeing
* @p: object to free memory of
*
* The memory of the object @p points to is zeroed before freed.
- * If @p is %NULL, kzfree() does nothing.
+ * If @p is %NULL, kfree_sensitive() does nothing.
*
* Note: this function zeroes the whole allocated buffer which can be a good
* deal bigger than the requested buffer size passed to kmalloc(). So be
* careful when using this function in performance sensitive code.
*/
-void kzfree(const void *p)
+void kfree_sensitive(const void *p)
{
size_t ks;
void *mem = (void *)p;
- if (unlikely(ZERO_OR_NULL_PTR(mem)))
- return;
ks = ksize(mem);
- memzero_explicit(mem, ks);
+ if (ks)
+ memzero_explicit(mem, ks);
kfree(mem);
}
-EXPORT_SYMBOL(kzfree);
+EXPORT_SYMBOL(kfree_sensitive);
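A minimal usage sketch of the renamed helper; the function name, the 32-byte size and the get_random_bytes() fill are purely illustrative (needs <linux/slab.h> and <linux/random.h>):

	static int example_use_key(void)
	{
		char *key = kmalloc(32, GFP_KERNEL);

		if (!key)
			return -ENOMEM;
		get_random_bytes(key, 32);
		/* ... use the key material ... */
		kfree_sensitive(key);	/* zeroes the whole allocation, then frees it */
		return 0;
	}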
/**
* ksize - get the actual amount of memory allocated for a given object
@@ -1770,8 +1155,6 @@ size_t ksize(const void *objp)
{
size_t size;
- if (WARN_ON_ONCE(!objp))
- return 0;
/*
* We need to check that the pointed to object is valid, and only then
* unpoison the shadow memory below. We use __kasan_check_read(), to
@@ -1785,7 +1168,7 @@ size_t ksize(const void *objp)
* We want to perform the check before __ksize(), to avoid potentially
* crashing in __ksize() due to accessing invalid metadata.
*/
- if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
return 0;
size = __ksize(objp);
diff --git a/mm/slob.c b/mm/slob.c
index ac2aecfbc7a8..7cc9805c8091 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -202,8 +202,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
if (!page)
return NULL;
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
return page_address(page);
}
@@ -214,8 +214,8 @@ static void slob_free_pages(void *b, int order)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(sp, order);
}
@@ -552,8 +552,8 @@ void kfree(const void *block)
slob_free(m, *m + align);
} else {
unsigned int order = compound_order(sp);
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(sp, order);
}
diff --git a/mm/slub.c b/mm/slub.c
index f226d66408ee..68c02b2eecd9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -114,18 +114,22 @@
* the fast path and disables lockless freelists.
*/
-static inline int kmem_cache_debug(struct kmem_cache *s)
-{
#ifdef CONFIG_SLUB_DEBUG
- return unlikely(s->flags & SLAB_DEBUG_FLAGS);
+#ifdef CONFIG_SLUB_DEBUG_ON
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
- return 0;
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
+#endif
+
+static inline bool kmem_cache_debug(struct kmem_cache *s)
+{
+ return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
void *fixup_red_left(struct kmem_cache *s, void *p)
{
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
p += s->red_left_pad;
return p;
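kmem_cache_debug_flags() itself is not part of this hunk (it is defined elsewhere in the series); judging by the call sites and the new static key, it presumably looks roughly like this sketch, not the actual definition:

	static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
	{
	#ifdef CONFIG_SLUB_DEBUG
		/* the static key is enabled by setup_slub_debug() when any debugging is requested */
		if (static_branch_unlikely(&slub_debug_enabled))
			return s->flags & flags;
	#endif
		return false;
	}

With the static key patched out (the common case when slub_debug is not enabled), the per-object debug checks in the fast paths reduce to a no-op branch.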
@@ -214,14 +218,10 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void memcg_propagate_slab_attrs(struct kmem_cache *s);
-static void sysfs_slab_remove(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -313,12 +313,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
__p < (__addr) + (__objects) * (__s)->size; \
__p += (__s)->size)
-/* Determine object index from a given position */
-static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
-{
- return (kasan_reset_tag(p) - addr) / s->size;
-}
-
static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
return ((unsigned int)PAGE_SIZE << order) / size;
@@ -461,7 +455,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
bitmap_zero(object_map, page->objects);
for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(slab_index(p, s, addr), object_map);
+ set_bit(__obj_to_index(s, addr, p), object_map);
return object_map;
}
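__obj_to_index() also comes from outside this hunk; together with the new s->reciprocal_size field initialised in calculate_sizes() later in this patch, it presumably replaces the open-coded division from slab_index() with a reciprocal divide, along these lines (a sketch under that assumption):

	static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
						  void *addr, void *obj)
	{
		/* same index as (obj - addr) / cache->size, without a per-lookup division */
		return reciprocal_divide(kasan_reset_tag(obj) - addr,
					 cache->reciprocal_size);
	}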
@@ -469,8 +463,6 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
static void put_map(unsigned long *map) __releases(&object_map_lock)
{
VM_BUG_ON(map != object_map);
- lockdep_assert_held(&object_map_lock);
-
spin_unlock(&object_map_lock);
}
@@ -499,7 +491,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
static slab_flags_t slub_debug;
#endif
-static char *slub_debug_slabs;
+static char *slub_debug_string;
static int disable_higher_order_debug;
/*
@@ -634,7 +626,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
#endif
}
-static void print_tracking(struct kmem_cache *s, void *object)
+void print_tracking(struct kmem_cache *s, void *object)
{
unsigned long pr_time = jiffies;
if (!(s->flags & SLAB_STORE_USER))
@@ -1112,7 +1104,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
- if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+ if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
return;
init_object(s, object, SLUB_RED_INACTIVE);
@@ -1122,7 +1114,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
static
void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
- if (!(s->flags & SLAB_POISON))
+ if (!kmem_cache_debug_flags(s, SLAB_POISON))
return;
metadata_access_enable();
@@ -1262,69 +1254,135 @@ out:
return ret;
}
-static int __init setup_slub_debug(char *str)
+/*
+ * Parse a block of slub_debug options. Blocks are delimited by ';'
+ *
+ * @str: start of block
+ * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
+ * @slabs: return start of list of slabs, or NULL when there's no list
+ * @init: assume this is initial parsing and not per-kmem-create parsing
+ *
+ * Returns the start of the next block, if any, or NULL.
+ */
+static char *
+parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
{
- slub_debug = DEBUG_DEFAULT_FLAGS;
- if (*str++ != '=' || !*str)
- /*
- * No options specified. Switch on full debugging.
- */
- goto out;
+ bool higher_order_disable = false;
- if (*str == ',')
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (*str == ',') {
/*
* No options but restriction on slabs. This means full
* debugging for slabs matching a pattern.
*/
+ *flags = DEBUG_DEFAULT_FLAGS;
goto check_slabs;
+ }
+ *flags = 0;
- slub_debug = 0;
- if (*str == '-')
- /*
- * Switch off all debugging measures.
- */
- goto out;
-
- /*
- * Determine which debug features should be switched on
- */
- for (; *str && *str != ','; str++) {
+ /* Determine which debug features should be switched on */
+ for (; *str && *str != ',' && *str != ';'; str++) {
switch (tolower(*str)) {
+ case '-':
+ *flags = 0;
+ break;
case 'f':
- slub_debug |= SLAB_CONSISTENCY_CHECKS;
+ *flags |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
- slub_debug |= SLAB_RED_ZONE;
+ *flags |= SLAB_RED_ZONE;
break;
case 'p':
- slub_debug |= SLAB_POISON;
+ *flags |= SLAB_POISON;
break;
case 'u':
- slub_debug |= SLAB_STORE_USER;
+ *flags |= SLAB_STORE_USER;
break;
case 't':
- slub_debug |= SLAB_TRACE;
+ *flags |= SLAB_TRACE;
break;
case 'a':
- slub_debug |= SLAB_FAILSLAB;
+ *flags |= SLAB_FAILSLAB;
break;
case 'o':
/*
* Avoid enabling debugging on caches if its minimum
* order would increase as a result.
*/
- disable_higher_order_debug = 1;
+ higher_order_disable = true;
break;
default:
- pr_err("slub_debug option '%c' unknown. skipped\n",
- *str);
+ if (init)
+ pr_err("slub_debug option '%c' unknown. skipped\n", *str);
}
}
-
check_slabs:
if (*str == ',')
- slub_debug_slabs = str + 1;
+ *slabs = ++str;
+ else
+ *slabs = NULL;
+
+ /* Skip over the slab list */
+ while (*str && *str != ';')
+ str++;
+
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (init && higher_order_disable)
+ disable_higher_order_debug = 1;
+
+ if (*str)
+ return str;
+ else
+ return NULL;
+}
+
+static int __init setup_slub_debug(char *str)
+{
+ slab_flags_t flags;
+ char *saved_str;
+ char *slab_list;
+ bool global_slub_debug_changed = false;
+ bool slab_list_specified = false;
+
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ saved_str = str;
+ while (str) {
+ str = parse_slub_debug_flags(str, &flags, &slab_list, true);
+
+ if (!slab_list) {
+ slub_debug = flags;
+ global_slub_debug_changed = true;
+ } else {
+ slab_list_specified = true;
+ }
+ }
+
+ /*
+ * For backwards compatibility, a single list of flags with list of
+ * slabs means debugging is only enabled for those slabs, so the global
+ * slub_debug should be 0. We can extend that to multiple lists as
+ * long as there is no option specifying flags without a slab list.
+ */
+ if (slab_list_specified) {
+ if (!global_slub_debug_changed)
+ slub_debug = 0;
+ slub_debug_string = saved_str;
+ }
out:
+ if (slub_debug != 0 || slub_debug_string)
+ static_branch_enable(&slub_debug_enabled);
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
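Reading the parser above together with the kmem_cache_flags() hunk below: blocks are separated by ';', and each block is a flag string optionally followed by ',' and a comma-separated slab name list. For example (the cache names here are only illustrative), booting with

	slub_debug=FZ;P,kmalloc-64,dentry

makes the list-less FZ block the global slub_debug, which applies to every cache that matches no listed name, while caches named kmalloc-64 or dentry pick up poisoning from their block instead. If every block carries a slab list, the global slub_debug stays 0, as the backwards-compatibility comment in setup_slub_debug() explains.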
@@ -1352,36 +1410,47 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
{
char *iter;
size_t len;
+ char *next_block;
+ slab_flags_t block_flags;
/* If slub_debug = 0, it folds into the if conditional. */
- if (!slub_debug_slabs)
+ if (!slub_debug_string)
return flags | slub_debug;
len = strlen(name);
- iter = slub_debug_slabs;
- while (*iter) {
- char *end, *glob;
- size_t cmplen;
-
- end = strchrnul(iter, ',');
+ next_block = slub_debug_string;
+ /* Go through all blocks of debug options, see if any matches our slab's name */
+ while (next_block) {
+ next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
+ if (!iter)
+ continue;
+ /* Found a block that has a slab list, search it */
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchrnul(iter, ',');
+ if (next_block && next_block < end)
+ end = next_block - 1;
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
- glob = strnchr(iter, end - iter, '*');
- if (glob)
- cmplen = glob - iter;
- else
- cmplen = max_t(size_t, len, (end - iter));
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= block_flags;
+ return flags;
+ }
- if (!strncmp(name, iter, cmplen)) {
- flags |= slub_debug;
- break;
+ if (!*end || *end == ';')
+ break;
+ iter = end + 1;
}
-
- if (!*end)
- break;
- iter = end + 1;
}
- return flags;
+ return slub_debug;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
@@ -1470,6 +1539,11 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(x, s->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
/* KASAN might put x into memory quarantine, delaying its reuse */
return kasan_slab_free(s, x, _RET_IP_);
}
@@ -1546,10 +1620,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
else
page = __alloc_pages_node(node, flags, order);
- if (page && charge_slab_page(page, flags, order, s)) {
- __free_pages(page, order);
- page = NULL;
- }
+ if (page)
+ account_slab_page(page, order, s);
return page;
}
@@ -1745,13 +1817,8 @@ out:
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
- flags &= ~GFP_SLAB_BUG_MASK;
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
- invalid_mask, &invalid_mask, flags, &flags);
- dump_stack();
- }
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -1762,7 +1829,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
void *p;
slab_pad_check(s, page);
@@ -1777,7 +1844,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- uncharge_slab_page(page, order, s);
+ unaccount_slab_page(page, order, s);
__free_pages(page, order);
}
@@ -2744,8 +2811,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
+ struct obj_cgroup *objcg = NULL;
- s = slab_pre_alloc_hook(s, gfpflags);
+ s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
return NULL;
redo:
@@ -2821,7 +2889,7 @@ redo:
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
- slab_post_alloc_hook(s, gfpflags, 1, &object);
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
return object;
}
@@ -3026,6 +3094,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+
+ memcg_slab_free_hook(s, page, head);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3205,9 +3275,10 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
{
struct kmem_cache_cpu *c;
int i;
+ struct obj_cgroup *objcg = NULL;
/* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (unlikely(!s))
return false;
/*
@@ -3261,11 +3332,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
/* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3675,6 +3746,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size = ALIGN(size, s->align);
s->size = size;
+ s->reciprocal_size = reciprocal_value(size);
if (forced_order >= 0)
order = forced_order;
else
@@ -3779,7 +3851,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- if (!test_bit(slab_index(p, s, addr), map)) {
+ if (!test_bit(__obj_to_index(s, addr, p), map)) {
pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
@@ -3842,7 +3914,6 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- sysfs_slab_remove(s);
return 0;
}
@@ -3912,8 +3983,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
page = alloc_pages_node(node, flags, order);
if (page) {
ptr = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
return kmalloc_large_node_hook(ptr, size, flags);
@@ -3980,7 +4051,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
offset = (ptr - page_address(page)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
if (offset < s->red_left_pad)
usercopy_abort("SLUB object in left red zone",
s->name, to_user, offset, n);
@@ -4044,8 +4115,8 @@ void kfree(const void *x)
BUG_ON(!PageCompound(page));
kfree_hook(object);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(page, order);
return;
}
@@ -4126,36 +4197,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return ret;
}
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
- /*
- * Called with all the locks held after a sched RCU grace period.
- * Even if @s becomes empty after shrinking, we can't know that @s
- * doesn't have allocations already in-flight and thus can't
- * destroy @s until the associated memcg is released.
- *
- * However, let's remove the sysfs files for empty caches here.
- * Each cache has a lot of interface files which aren't
- * particularly useful for empty draining caches; otherwise, we can
- * easily end up with millions of unnecessary sysfs files on
- * systems which have a lot of memory and transient cgroups.
- */
- if (!__kmem_cache_shrink(s))
- sysfs_slab_remove(s);
-}
-
-void __kmemcg_cache_deactivate(struct kmem_cache *s)
-{
- /*
- * Disable empty slabs caching. Used to avoid pinning offline
- * memory cgroups by kmem pages that can be freed.
- */
- slub_set_cpu_partial(s, 0);
- s->min_partial = 0;
-}
-#endif /* CONFIG_MEMCG */
-
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -4310,9 +4351,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
p->slab_cache = s;
#endif
}
- slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, NULL);
return s;
}
@@ -4367,7 +4406,7 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
- struct kmem_cache *s, *c;
+ struct kmem_cache *s;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
@@ -4380,11 +4419,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache(c, s) {
- c->object_size = s->object_size;
- c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
- }
-
if (sysfs_slab_alias(s, name)) {
s->refcount--;
s = NULL;
@@ -4406,7 +4440,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
if (slab_state <= UP)
return 0;
- memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
__kmem_cache_release(s);
@@ -4495,7 +4528,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page)
/* Now we know that a valid freelist exists */
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- u8 val = test_bit(slab_index(p, s, addr), map) ?
+ u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
if (!check_object(s, page, p, val))
@@ -4686,7 +4719,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
map = get_map(s, page);
for_each_object(p, s, addr, page->objects)
- if (!test_bit(slab_index(p, s, addr), map))
+ if (!test_bit(__obj_to_index(s, addr, p), map))
add_location(t, s, get_track(s, p, alloc));
put_map(map);
}
@@ -4970,20 +5003,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
return x + sprintf(buf + x, "\n");
}
-#ifdef CONFIG_SLUB_DEBUG
-static int any_slab_objects(struct kmem_cache *s)
-{
- int node;
- struct kmem_cache_node *n;
-
- for_each_kmem_cache_node(s, node, n)
- if (atomic_long_read(&n->total_objects))
- return 1;
-
- return 0;
-}
-#endif
-
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
@@ -5025,28 +5044,11 @@ static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objs_per_slab);
-static ssize_t order_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- unsigned int order;
- int err;
-
- err = kstrtouint(buf, 10, &order);
- if (err)
- return err;
-
- if (order > slub_max_order || order < slub_min_order)
- return -EINVAL;
-
- calculate_sizes(s, order);
- return length;
-}
-
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%u\n", oo_order(s->oo));
}
-SLAB_ATTR(order);
+SLAB_ATTR_RO(order);
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
@@ -5168,16 +5170,7 @@ static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
-
-static ssize_t reclaim_account_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
- if (buf[0] == '1')
- s->flags |= SLAB_RECLAIM_ACCOUNT;
- return length;
-}
-SLAB_ATTR(reclaim_account);
+SLAB_ATTR_RO(reclaim_account);
static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
@@ -5222,104 +5215,34 @@ static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
-
-static ssize_t sanity_checks_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_CONSISTENCY_CHECKS;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_CONSISTENCY_CHECKS;
- }
- return length;
-}
-SLAB_ATTR(sanity_checks);
+SLAB_ATTR_RO(sanity_checks);
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
-
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- /*
- * Tracing a merged cache is going to give confusing results
- * as well as cause other issues like converting a mergeable
- * cache into an umergeable one.
- */
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_TRACE;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_TRACE;
- }
- return length;
-}
-SLAB_ATTR(trace);
+SLAB_ATTR_RO(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
-static ssize_t red_zone_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_RED_ZONE;
- if (buf[0] == '1') {
- s->flags |= SLAB_RED_ZONE;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(red_zone);
+SLAB_ATTR_RO(red_zone);
static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
-static ssize_t poison_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_POISON;
- if (buf[0] == '1') {
- s->flags |= SLAB_POISON;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(poison);
+SLAB_ATTR_RO(poison);
static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
-static ssize_t store_user_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_STORE_USER;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_STORE_USER;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(store_user);
+SLAB_ATTR_RO(store_user);
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
@@ -5362,19 +5285,7 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
-
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_FAILSLAB;
- if (buf[0] == '1')
- s->flags |= SLAB_FAILSLAB;
- return length;
-}
-SLAB_ATTR(failslab);
+SLAB_ATTR_RO(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
@@ -5386,7 +5297,7 @@ static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
if (buf[0] == '1')
- kmem_cache_shrink_all(s);
+ kmem_cache_shrink(s);
else
return -EINVAL;
return length;
@@ -5610,98 +5521,9 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG
- if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- struct kmem_cache *c;
-
- mutex_lock(&slab_mutex);
- if (s->max_attr_size < len)
- s->max_attr_size = len;
-
- /*
- * This is a best effort propagation, so this function's return
- * value will be determined by the parent cache only. This is
- * basically because not all attributes will have a well
- * defined semantics for rollbacks - most of the actions will
- * have permanent effects.
- *
- * Returning the error value of any of the children that fail
- * is not 100 % defined, in the sense that users seeing the
- * error code won't be able to know anything about the state of
- * the cache.
- *
- * Only returning the error code for the parent cache at least
- * has well defined semantics. The cache being written to
- * directly either failed or succeeded, in which case we loop
- * through the descendants with best-effort propagation.
- */
- for_each_memcg_cache(c, s)
- attribute->store(c, buf, len);
- mutex_unlock(&slab_mutex);
- }
-#endif
return err;
}
-static void memcg_propagate_slab_attrs(struct kmem_cache *s)
-{
-#ifdef CONFIG_MEMCG
- int i;
- char *buffer = NULL;
- struct kmem_cache *root_cache;
-
- if (is_root_cache(s))
- return;
-
- root_cache = s->memcg_params.root_cache;
-
- /*
- * This mean this cache had no attribute written. Therefore, no point
- * in copying default values around
- */
- if (!root_cache->max_attr_size)
- return;
-
- for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
- char mbuf[64];
- char *buf;
- struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
- ssize_t len;
-
- if (!attr || !attr->store || !attr->show)
- continue;
-
- /*
- * It is really bad that we have to allocate here, so we will
- * do it only as a fallback. If we actually allocate, though,
- * we can just use the allocated buffer until the end.
- *
- * Most of the slub attributes will tend to be very small in
- * size, but sysfs allows buffers up to a page, so they can
- * theoretically happen.
- */
- if (buffer)
- buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
- !IS_ENABLED(CONFIG_SLUB_STATS))
- buf = mbuf;
- else {
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
- if (WARN_ON(!buffer))
- continue;
- buf = buffer;
- }
-
- len = attr->show(root_cache, buf);
- if (len > 0)
- attr->store(s, buf, len);
- }
-
- if (buffer)
- free_page((unsigned long)buffer);
-#endif /* CONFIG_MEMCG */
-}
-
static void kmem_cache_release(struct kobject *k)
{
slab_kmem_cache_release(to_slab(k));
@@ -5721,10 +5543,6 @@ static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG
- if (!is_root_cache(s))
- return s->memcg_params.root_cache->memcg_kset;
-#endif
return slab_kset;
}
@@ -5767,27 +5585,6 @@ static char *create_unique_id(struct kmem_cache *s)
return name;
}
-static void sysfs_slab_remove_workfn(struct work_struct *work)
-{
- struct kmem_cache *s =
- container_of(work, struct kmem_cache, kobj_remove_work);
-
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- goto out;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
-out:
- kobject_put(&s->kobj);
-}
-
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
@@ -5795,8 +5592,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
- INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
-
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
return 0;
@@ -5833,16 +5628,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
if (err)
goto out_del_kobj;
-#ifdef CONFIG_MEMCG
- if (is_root_cache(s) && memcg_sysfs_enabled) {
- s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
- if (!s->memcg_kset) {
- err = -ENOMEM;
- goto out_del_kobj;
- }
- }
-#endif
-
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
@@ -5856,19 +5641,6 @@ out_del_kobj:
goto out;
}
-static void sysfs_slab_remove(struct kmem_cache *s)
-{
- if (slab_state < FULL)
- /*
- * Sysfs has not been setup yet so no need to remove the
- * cache from sysfs.
- */
- return;
-
- kobject_get(&s->kobj);
- schedule_work(&s->kobj_remove_work);
-}
-
void sysfs_slab_unlink(struct kmem_cache *s)
{
if (slab_state >= FULL)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 0db7738d76e9..16183d85a7d5 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -69,11 +69,19 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
__pa(MAX_DMA_ADDRESS));
}
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap);
+
/* need to make sure size is all the same during early stage */
-void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
+ struct vmem_altmap *altmap)
{
- void *ptr = sparse_buffer_alloc(size);
+ void *ptr;
+
+ if (altmap)
+ return altmap_alloc_block_buf(size, altmap);
+ ptr = sparse_buffer_alloc(size);
if (!ptr)
ptr = vmemmap_alloc_block(size, node);
return ptr;
@@ -94,15 +102,8 @@ static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
return 0;
}
-/**
- * altmap_alloc_block_buf - allocate pages from the device page map
- * @altmap: device page map
- * @size: size (in bytes) of the allocation
- *
- * Allocations are aligned to the size of the request.
- */
-void * __meminit altmap_alloc_block_buf(unsigned long size,
- struct vmem_altmap *altmap)
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap)
{
unsigned long pfn, nr_pfns, nr_align;
@@ -139,12 +140,15 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
start, end - 1);
}
-pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
+pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+ struct vmem_altmap *altmap)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -212,8 +216,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
return pgd;
}
-int __meminit vmemmap_populate_basepages(unsigned long start,
- unsigned long end, int node)
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
{
unsigned long addr = start;
pgd_t *pgd;
@@ -235,7 +239,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return -ENOMEM;
- pte = vmemmap_pte_populate(pmd, addr, node);
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap);
if (!pte)
return -ENOMEM;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -247,20 +251,12 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- unsigned long start;
- unsigned long end;
-
- /*
- * The minimum granularity of memmap extensions is
- * PAGES_PER_SUBSECTION as allocations are tracked in the
- * 'subsection_map' bitmap of the section.
- */
- end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
- pfn &= PAGE_SUBSECTION_MASK;
- nr_pages = end - pfn;
-
- start = (unsigned long) pfn_to_page(pfn);
- end = start + nr_pages * sizeof(struct page);
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
+
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
+ !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
+ return NULL;
if (vmemmap_populate(start, end, nid, altmap))
return NULL;
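Where the removed code silently expanded the range to subsection boundaries, the populate path now requires callers to pass subsection-aligned pfn and nr_pages and bails out (after a one-time warning) otherwise. Since PAGES_PER_SUBSECTION is a power of two, the alignment test is equivalent to this sketch (minus the warning):

	/* both pfn and nr_pages must be multiples of PAGES_PER_SUBSECTION */
	if ((pfn | nr_pages) & (PAGES_PER_SUBSECTION - 1))
		return NULL;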
diff --git a/mm/sparse.c b/mm/sparse.c
index b2b9a3e34696..fcc3d176f1ea 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -16,7 +16,6 @@
#include "internal.h"
#include <asm/dma.h>
-#include <asm/pgalloc.h>
/*
* Permanent SPARSEMEM data:
@@ -250,7 +249,7 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
#endif
/* Record a memory area against a node. */
-void __init memory_present(int nid, unsigned long start, unsigned long end)
+static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
@@ -286,11 +285,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
}
/*
- * Mark all memblocks as present using memory_present(). This is a
- * convenience function that is useful for a number of arches
- * to mark all of the systems memory as present during initialization.
+ * Mark all memblocks as present using memory_present().
+ * This is a convenience function that is useful to mark all of the system's
+ * memory as present during initialization.
*/
-void __init memblocks_present(void)
+static void __init memblocks_present(void)
{
struct memblock_region *reg;
@@ -575,9 +574,13 @@ failed:
*/
void __init sparse_init(void)
{
- unsigned long pnum_begin = first_present_section_nr();
- int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
- unsigned long pnum_end, map_count = 1;
+ unsigned long pnum_end, pnum_begin, map_count = 1;
+ int nid_begin;
+
+ memblocks_present();
+
+ pnum_begin = first_present_section_nr();
+ nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();
@@ -825,10 +828,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
}
- if (section_is_early && memmap)
- free_map_bootmem(memmap);
- else
+ /*
+ * The memmap of early sections is always fully populated. See
+ * section_activate() and pfn_valid().
+ */
+ if (!section_is_early)
depopulate_section_memmap(pfn, nr_pages, altmap);
+ else if (memmap)
+ free_map_bootmem(memmap);
if (empty)
ms->section_mem_map = (unsigned long)NULL;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0975adc72253..3e6453573a89 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -46,8 +46,7 @@ static void __drain_swap_slots_cache(unsigned int type);
static void deactivate_swap_slots_cache(void);
static void reactivate_swap_slots_cache(void);
-#define use_swap_slot_cache (swap_slot_cache_active && \
- swap_slot_cache_enabled && swap_slot_cache_initialized)
+#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
#define SLOTS_CACHE 0x1
#define SLOTS_CACHE_RET 0x2
@@ -94,7 +93,7 @@ static bool check_cache_active(void)
{
long pages;
- if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
+ if (!swap_slot_cache_enabled)
return false;
pages = get_nr_swap_pages();
@@ -136,9 +135,16 @@ static int alloc_swap_slot_cache(unsigned int cpu)
mutex_lock(&swap_slots_cache_mutex);
cache = &per_cpu(swp_slots, cpu);
- if (cache->slots || cache->slots_ret)
+ if (cache->slots || cache->slots_ret) {
/* cache already allocated */
- goto out;
+ mutex_unlock(&swap_slots_cache_mutex);
+
+ kvfree(slots);
+ kvfree(slots_ret);
+
+ return 0;
+ }
+
if (!cache->lock_initialized) {
mutex_init(&cache->alloc_lock);
spin_lock_init(&cache->free_lock);
@@ -155,15 +161,8 @@ static int alloc_swap_slot_cache(unsigned int cpu)
*/
mb();
cache->slots = slots;
- slots = NULL;
cache->slots_ret = slots_ret;
- slots_ret = NULL;
-out:
mutex_unlock(&swap_slots_cache_mutex);
- if (slots)
- kvfree(slots);
- if (slots_ret)
- kvfree(slots_ret);
return 0;
}
@@ -240,21 +239,19 @@ static int free_slot_cache(unsigned int cpu)
int enable_swap_slots_cache(void)
{
- int ret = 0;
-
mutex_lock(&swap_slots_cache_enable_mutex);
- if (swap_slot_cache_initialized) {
- __reenable_swap_slots_cache();
- goto out_unlock;
- }
+ if (!swap_slot_cache_initialized) {
+ int ret;
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
- alloc_swap_slot_cache, free_slot_cache);
- if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
- "without swap slots cache.\n", __func__))
- goto out_unlock;
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+ alloc_swap_slot_cache, free_slot_cache);
+ if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+ "without swap slots cache.\n", __func__))
+ goto out_unlock;
+
+ swap_slot_cache_initialized = true;
+ }
- swap_slot_cache_initialized = true;
__reenable_swap_slots_cache();
out_unlock:
mutex_unlock(&swap_slots_cache_enable_mutex);
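
The restructured alloc_swap_slot_cache() above allocates the slot arrays before taking swap_slots_cache_mutex and, if another CPU already set the cache up, drops the lock and frees its own copies instead of routing every path through a shared "out:" label. Below is a minimal userspace sketch of that allocate-outside-the-lock pattern; the names and types are illustrative, using pthread/libc primitives rather than the kernel API.

/* Illustrative sketch only: allocate with no lock held, then discard
 * the allocation if the cache turned out to be populated already. */
#include <pthread.h>
#include <stdlib.h>

struct slot_cache {
	pthread_mutex_t lock;
	int *slots;			/* NULL until initialized */
};

static int slot_cache_init(struct slot_cache *cache, size_t n)
{
	int *slots = calloc(n, sizeof(*slots));	/* no lock held yet */

	if (!slots)
		return -1;

	pthread_mutex_lock(&cache->lock);
	if (cache->slots) {
		/* Lost the race: keep the existing cache, drop our copy. */
		pthread_mutex_unlock(&cache->lock);
		free(slots);
		return 0;
	}
	cache->slots = slots;
	pthread_mutex_unlock(&cache->lock);
	return 0;
}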
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 05889e8e3c97..e82f4f8b1f63 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -725,7 +725,7 @@ static void swap_ra_info(struct vm_fault *vmf,
/**
* swap_vma_readahead - swap in pages in hope we need them soon
- * @entry: swap entry of this memory
+ * @fentry: swap entry of this memory
* @gfp_mask: memory allocation flags
* @vmf: fault information
*
diff --git a/mm/util.c b/mm/util.c
index c63c8e47be57..5ef378a2a038 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -503,8 +503,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
- &populate, &uf);
+ ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+ &uf);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
@@ -746,6 +746,47 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
return ret;
}
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+ percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int new_policy;
+ int ret;
+
+ /*
+ * The deviation of vm_committed_as could be big with a loose policy
+ * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing the policy
+ * to the strict OVERCOMMIT_NEVER, we need to reduce the deviation to
+ * comply with the strict "NEVER", and to avoid a possible race
+ * condition (even though users rarely switch to OVERCOMMIT_NEVER),
+ * the switch is done in the following order:
+ * 1. changing the batch
+ * 2. sync percpu count on each CPU
+ * 3. switch the policy
+ */
+ if (write) {
+ t = *table;
+ t.data = &new_policy;
+ ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ mm_compute_batch(new_policy);
+ if (new_policy == OVERCOMMIT_NEVER)
+ schedule_on_each_cpu(sync_overcommit_as);
+ sysctl_overcommit_memory = new_policy;
+ } else {
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ }
+
+ return ret;
+}
+
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -787,10 +828,15 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
+ *
+ * The time cost of this is very low for small platforms; for a big
+ * platform such as a 2S/36C/72T Skylake server, in the worst case,
+ * where vm_committed_as's spinlock is under severe contention, the
+ * time cost could be about 30~40 microseconds.
*/
unsigned long vm_memory_committed(void)
{
- return percpu_counter_read_positive(&vm_committed_as);
+ return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
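
Two related changes meet in mm/util.c: overcommit_policy_handler() orders the switch to OVERCOMMIT_NEVER as "adjust the batch, fold the per-CPU deltas via schedule_on_each_cpu(sync_overcommit_as), then flip the policy", and vm_memory_committed() now uses percpu_counter_sum_positive() so readers see the exact, folded value rather than an approximation. The sketch below contrasts the approximate and exact reads with a simplified userspace stand-in for a percpu counter; the structure and names are illustrative only.

/* Simplified model of a percpu counter: a folded global value plus
 * per-CPU batches that have not been folded in yet. */
#include <stdatomic.h>

#define NR_CPUS	4

struct pcpu_counter {
	atomic_long count;		/* folded global value */
	long cpu_delta[NR_CPUS];	/* unfolded per-CPU batches */
};

/* Fast but approximate: ignores whatever is still batched per CPU,
 * in the spirit of percpu_counter_read_positive(). */
static long counter_read_positive(struct pcpu_counter *c)
{
	long v = atomic_load(&c->count);

	return v > 0 ? v : 0;
}

/* Slower but exact: folds every per-CPU delta in, in the spirit of
 * percpu_counter_sum_positive(); this is what the strict
 * OVERCOMMIT_NEVER accounting relies on. */
static long counter_sum_positive(struct pcpu_counter *c)
{
	long v = atomic_load(&c->count);

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		v += c->cpu_delta[cpu];

	return v > 0 ? v : 0;
}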
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5a2b55c8dd9a..b482d240f9a2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -7,6 +7,7 @@
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
#include <linux/vmalloc.h>
@@ -25,7 +26,7 @@
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
@@ -41,6 +42,7 @@
#include <asm/shmparam.h>
#include "internal.h"
+#include "pgalloc-track.h"
bool is_vmalloc_addr(const void *x)
{
@@ -173,7 +175,6 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
- start = addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
@@ -511,6 +512,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
/*
* This function returns back addresses of parent node
* and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps of inserting the conflicting, overlapping range
+ * must be abandoned and treated as a bug.
*/
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
@@ -549,8 +554,12 @@ find_va_links(struct vmap_area *va,
else if (va->va_end > tmp_va->va_start &&
va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
- else
- BUG();
+ else {
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+ return NULL;
+ }
} while (*link);
*parent = &tmp_va->rb_node;
@@ -632,43 +641,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
{
struct vmap_area *va;
- struct rb_node *node;
- unsigned long size;
- bool found = false;
-
- if (n == NULL)
- return;
+ unsigned long computed_size;
- va = rb_entry(n, struct vmap_area, rb_node);
- size = va->subtree_max_size;
- node = n;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
-
- if (get_subtree_max_size(node->rb_left) == size) {
- node = node->rb_left;
- } else {
- if (va_size(va) == size) {
- found = true;
- break;
- }
-
- node = node->rb_right;
- }
- }
-
- if (!found) {
- va = rb_entry(n, struct vmap_area, rb_node);
- pr_emerg("tree is corrupted: %lu, %lu\n",
- va_size(va), va->subtree_max_size);
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ computed_size = compute_subtree_max_size(va);
+ if (computed_size != va->subtree_max_size)
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
}
-
- augment_tree_propagate_check(n->rb_left);
- augment_tree_propagate_check(n->rb_right);
}
#endif
@@ -702,28 +685,15 @@ augment_tree_propagate_check(struct rb_node *n)
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
- struct rb_node *node = &va->rb_node;
- unsigned long new_va_sub_max_size;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
- new_va_sub_max_size = compute_subtree_max_size(va);
-
- /*
- * If the newly calculated maximum available size of the
- * subtree is equal to the current one, then it means that
- * the tree is propagated correctly. So we have to stop at
- * this point to save cycles.
- */
- if (va->subtree_max_size == new_va_sub_max_size)
- break;
-
- va->subtree_max_size = new_va_sub_max_size;
- node = rb_parent(&va->rb_node);
- }
+ /*
+ * Populate the tree from bottom towards the root until
+ * the calculated maximum available size of a checked node
+ * is equal to its current one.
+ */
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
#if DEBUG_AUGMENT_PROPAGATE_CHECK
- augment_tree_propagate_check(free_vmap_area_root.rb_node);
+ augment_tree_propagate_check();
#endif
}
@@ -735,7 +705,8 @@ insert_vmap_area(struct vmap_area *va,
struct rb_node *parent;
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
+ if (link)
+ link_va(va, root, parent, link, head);
}
static void
@@ -751,8 +722,10 @@ insert_vmap_area_augment(struct vmap_area *va,
else
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
+ if (link) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
}
/*
@@ -760,6 +733,11 @@ insert_vmap_area_augment(struct vmap_area *va,
* and next free blocks. If coalesce is not done a new
* free area is inserted. If VA has been merged, it is
* freed.
+ *
+ * Please note, it can return NULL in case of overlapping
+ * ranges, after issuing a WARN() report. Although this is
+ * buggy behaviour, the system can stay alive and keep
+ * going.
*/
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
@@ -776,6 +754,8 @@ merge_or_add_vmap_area(struct vmap_area *va,
* inserted, unless it is merged with its sibling/siblings.
*/
link = find_va_links(va, root, NULL, &parent);
+ if (!link)
+ return NULL;
/*
* Get next node of VA to check if merging can be done.
@@ -796,9 +776,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -818,14 +795,18 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list);
if (sibling->va_end == va->va_start) {
- sibling->va_end = va->va_end;
-
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
+ /*
+ * If both neighbors are coalesced, it is important
+ * to unlink the "next" node first, followed by merging
+ * with "previous" one. Otherwise the tree might not be
+ * fully populated if a sibling's augmented value is
+ * "normalized" because of rotation operations.
+ */
if (merged)
unlink_va(va, root);
+ sibling->va_end = va->va_end;
+
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -836,11 +817,13 @@ merge_or_add_vmap_area(struct vmap_area *va,
}
insert:
- if (!merged) {
+ if (!merged)
link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
- }
+ /*
+ * The last step is to check and update the tree.
+ */
+ augment_tree_propagate_from(va);
return va;
}
@@ -1381,6 +1364,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va = merge_or_add_vmap_area(va, &free_vmap_area_root,
&free_vmap_area_list);
+ if (!va)
+ continue;
+
if (is_vmalloc_or_module_addr((void *)orig_start))
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -1513,12 +1499,11 @@ struct vmap_block {
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
* in the free path. Could get rid of this if we change the API to return a
* "cookie" from alloc, to be passed to free. But no big deal yet.
*/
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -1575,13 +1560,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
- err = radix_tree_preload(gfp_mask);
- if (unlikely(err)) {
- kfree(vb);
- free_vmap_area(va);
- return ERR_PTR(err);
- }
-
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -1594,11 +1572,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
- spin_lock(&vmap_block_tree_lock);
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
- spin_unlock(&vmap_block_tree_lock);
- BUG_ON(err);
- radix_tree_preload_end();
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ if (err) {
+ kfree(vb);
+ free_vmap_area(va);
+ return ERR_PTR(err);
+ }
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
@@ -1612,12 +1591,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
- unsigned long vb_idx;
- vb_idx = addr_to_vb_idx(vb->va->va_start);
- spin_lock(&vmap_block_tree_lock);
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
- spin_unlock(&vmap_block_tree_lock);
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
@@ -1723,7 +1698,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
- unsigned long vb_idx;
unsigned int order;
struct vmap_block *vb;
@@ -1733,14 +1707,8 @@ static void vb_free(unsigned long addr, unsigned long size)
flush_cache_vunmap(addr, addr + size);
order = get_order(size);
-
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
-
- vb_idx = addr_to_vb_idx(addr);
- rcu_read_lock();
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
- rcu_read_unlock();
- BUG_ON(!vb);
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
unmap_kernel_range_noflush(addr, size);
@@ -3383,8 +3351,9 @@ recovery:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
}
@@ -3432,8 +3401,9 @@ err_free_shadow:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
kfree(vms[area]);
}
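
The mm/vmalloc.c hunks above replace the vmap_block radix tree with an XArray: xa_insert() does its own node allocation under the internal xa_lock, so the radix_tree_preload()/external spinlock dance goes away, and lookups collapse to a plain xa_load(). A minimal sketch of that usage pattern follows; it is kernel-style illustration code with hypothetical names, not a drop-in for the functions above.

#include <linux/xarray.h>

static DEFINE_XARRAY(example_blocks);

static int example_store(unsigned long idx, void *blk, gfp_t gfp)
{
	/* xa_insert() allocates internally under its own xa_lock, so no
	 * preload step or external spinlock is needed; it returns -EBUSY
	 * if an entry already exists at this index. */
	return xa_insert(&example_blocks, idx, blk, gfp);
}

static void *example_find(unsigned long idx)
{
	/* xa_load() takes the RCU read lock itself. */
	return xa_load(&example_blocks, idx);
}

static void *example_remove(unsigned long idx)
{
	/* Returns the entry that was removed, or NULL. */
	return xa_erase(&example_blocks, idx);
}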
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 749d239c62b2..72da290b171b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -170,11 +170,6 @@ struct scan_control {
* From 0 .. 200. Higher means more swappy.
*/
int vm_swappiness = 60;
-/*
- * The total number of pages which are beyond the high watermark within all
- * zones.
- */
-unsigned long vm_total_pages;
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
@@ -915,7 +910,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* order to detect refaults, thus thrashing, later on.
*
* But don't store shadows in an address space that is
- * already exiting. This is not just an optizimation,
+ * already exiting. This is not just an optimization,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
@@ -2035,7 +2030,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- __count_vm_events(PGREFILL, nr_scanned);
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2331,7 +2327,8 @@ out:
unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(memcg,
+ protection = mem_cgroup_protection(sc->target_mem_cgroup,
+ memcg,
sc->memcg_low_reclaim);
if (protection) {
@@ -2619,14 +2616,15 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;
- switch (mem_cgroup_protected(target_memcg, memcg)) {
- case MEMCG_PROT_MIN:
+ mem_cgroup_calculate_protection(target_memcg, memcg);
+
+ if (mem_cgroup_below_min(memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
- case MEMCG_PROT_LOW:
+ } else if (mem_cgroup_below_low(memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
@@ -2638,16 +2636,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
- break;
- case MEMCG_PROT_NONE:
- /*
- * All protection thresholds breached. We may
- * still choose to vary the scan pressure
- * applied based on by how much the cgroup in
- * question has exceeded its protection
- * thresholds (see get_scan_count).
- */
- break;
}
reclaimed = sc->nr_reclaimed;
@@ -3318,7 +3306,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
bool may_swap)
{
unsigned long nr_reclaimed;
- unsigned long pflags;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -3339,17 +3326,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
set_task_reclaim_state(current, &sc.reclaim_state);
-
trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
-
- psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
memalloc_noreclaim_restore(noreclaim_flag);
- psi_memstall_leave(&pflags);
-
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
set_task_reclaim_state(current, NULL);
@@ -4222,7 +4204,8 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
* unmapped file backed pages.
*/
if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
- node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
+ pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3fb23a21f6dd..2b866cbab11d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -341,6 +341,11 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long x;
long t;
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -398,6 +403,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -442,6 +449,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -541,6 +550,11 @@ static inline void mod_node_state(struct pglist_data *pgdat,
s8 __percpu *p = pcp->vm_node_stat_diff + item;
long o, n, t, z;
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
do {
z = 0; /* overflow to node counters */
@@ -989,8 +1003,8 @@ unsigned long sum_zone_numa_state(int node,
/*
* Determine the per node value of a stat item.
*/
-unsigned long node_page_state(struct pglist_data *pgdat,
- enum node_stat_item item)
+unsigned long node_page_state_pages(struct pglist_data *pgdat,
+ enum node_stat_item item)
{
long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
@@ -999,6 +1013,14 @@ unsigned long node_page_state(struct pglist_data *pgdat,
#endif
return x;
}
+
+unsigned long node_page_state(struct pglist_data *pgdat,
+ enum node_stat_item item)
+{
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+ return node_page_state_pages(pgdat, item);
+}
#endif
#ifdef CONFIG_COMPACTION
@@ -1118,10 +1140,6 @@ const char * const vmstat_text[] = {
"nr_zone_write_pending",
"nr_mlock",
"nr_page_table_pages",
- "nr_kernel_stack",
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- "nr_shadow_call_stack",
-#endif
"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
@@ -1172,6 +1190,10 @@ const char * const vmstat_text[] = {
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
+ "nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ "nr_shadow_call_stack",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1577,7 +1599,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n %-12s %lu", node_stat_name(i),
- node_page_state(pgdat, i));
+ node_page_state_pages(pgdat, i));
}
}
seq_printf(m,
@@ -1698,7 +1720,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- v[i] = global_node_page_state(i);
+ v[i] = global_node_page_state_pages(i);
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
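
With NR_SLAB_(UN)RECLAIMABLE_B and friends now accounted in bytes, the modifier helpers above convert byte deltas to pages up front and warn if a delta is not page-aligned, while node_page_state() becomes a page-unit wrapper around node_page_state_pages(). A standalone sketch of that conversion check is below; the EX_PAGE_* constants stand in for the kernel's PAGE_SHIFT/PAGE_SIZE and the function name is hypothetical.

#include <assert.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1L << EX_PAGE_SHIFT)

/* Convert a byte delta for a byte-counted stat item into pages,
 * insisting that the delta only ever moves in whole pages. */
static long bytes_to_page_delta(long delta)
{
	assert((delta & (EX_PAGE_SIZE - 1)) == 0);

	return delta >> EX_PAGE_SHIFT;	/* e.g. 2 * EX_PAGE_SIZE -> 2 */
}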
diff --git a/mm/workingset.c b/mm/workingset.c
index 50b7937bab32..b199726924dd 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -486,8 +486,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
NR_LRU_BASE + i);
- pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
- pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
} else
#endif
pages = node_present_pages(sc->nid);
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index 3286f9d527d3..f7a2f0e41105 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -180,7 +180,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
static void in_cache_put(in_cache_entry *entry)
{
if (refcount_dec_and_test(&entry->use)) {
- kzfree(entry);
+ kfree_sensitive(entry);
}
}
@@ -415,7 +415,7 @@ static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr,
static void eg_cache_put(eg_cache_entry *entry)
{
if (refcount_dec_and_test(&entry->use)) {
- kzfree(entry);
+ kfree_sensitive(entry);
}
}
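
This hunk is the first of the tree-wide kzfree() -> kfree_sensitive() renames that follow; the behaviour is unchanged, the new name just makes it explicit that the buffer is wiped before it is freed so that key material does not linger in freed memory. The userspace sketch below captures the idea only: unlike the kernel helper, which gets the allocation size from ksize(), it needs the length passed in, and it wipes through a volatile pointer where the kernel uses memzero_explicit().

#include <stdlib.h>

static void free_sensitive(void *p, size_t len)
{
	volatile unsigned char *q = p;

	if (!p)
		return;

	/* Wipe through a volatile pointer so the compiler cannot elide
	 * the store-before-free as a dead write. */
	while (len--)
		*q++ = 0;

	free(p);
}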
diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c
index 2155ce802877..3226fe02e875 100644
--- a/net/bluetooth/ecdh_helper.c
+++ b/net/bluetooth/ecdh_helper.c
@@ -104,7 +104,7 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64],
free_all:
kpp_request_free(req);
free_tmp:
- kzfree(tmp);
+ kfree_sensitive(tmp);
return err;
}
@@ -151,9 +151,9 @@ int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32])
err = crypto_kpp_set_secret(tfm, buf, buf_len);
/* fall through */
free_all:
- kzfree(buf);
+ kfree_sensitive(buf);
free_tmp:
- kzfree(tmp);
+ kfree_sensitive(tmp);
return err;
}
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 433227f96c73..bf4bef13d935 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -753,9 +753,9 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags);
mgmt_smp_complete(hcon, complete);
- kzfree(smp->csrk);
- kzfree(smp->slave_csrk);
- kzfree(smp->link_key);
+ kfree_sensitive(smp->csrk);
+ kfree_sensitive(smp->slave_csrk);
+ kfree_sensitive(smp->link_key);
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
@@ -789,7 +789,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
}
chan->data = NULL;
- kzfree(smp);
+ kfree_sensitive(smp);
hci_conn_drop(hcon);
}
@@ -1156,7 +1156,7 @@ static void sc_generate_link_key(struct smp_chan *smp)
const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
- kzfree(smp->link_key);
+ kfree_sensitive(smp->link_key);
smp->link_key = NULL;
return;
}
@@ -1165,14 +1165,14 @@ static void sc_generate_link_key(struct smp_chan *smp)
const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
- kzfree(smp->link_key);
+ kfree_sensitive(smp->link_key);
smp->link_key = NULL;
return;
}
}
if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
- kzfree(smp->link_key);
+ kfree_sensitive(smp->link_key);
smp->link_key = NULL;
return;
}
@@ -1407,7 +1407,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
free_shash:
crypto_free_shash(smp->tfm_cmac);
zfree_smp:
- kzfree(smp);
+ kfree_sensitive(smp);
return NULL;
}
@@ -3278,7 +3278,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- kzfree(smp);
+ kfree_sensitive(smp);
return ERR_CAST(tfm_cmac);
}
@@ -3286,7 +3286,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
if (IS_ERR(tfm_ecdh)) {
BT_ERR("Unable to create ECDH crypto context");
crypto_free_shash(tfm_cmac);
- kzfree(smp);
+ kfree_sensitive(smp);
return ERR_CAST(tfm_ecdh);
}
@@ -3300,7 +3300,7 @@ create_chan:
if (smp) {
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
- kzfree(smp);
+ kfree_sensitive(smp);
}
return ERR_PTR(-ENOMEM);
}
@@ -3347,7 +3347,7 @@ static void smp_del_chan(struct l2cap_chan *chan)
chan->data = NULL;
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
- kzfree(smp);
+ kfree_sensitive(smp);
}
l2cap_chan_put(chan);
diff --git a/net/core/sock.c b/net/core/sock.c
index d29709e0790d..a2044b4b606b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2265,7 +2265,7 @@ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
if (WARN_ON_ONCE(!mem))
return;
if (nullify)
- kzfree(mem);
+ kfree_sensitive(mem);
else
kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 19ad9586c720..c1a54f3d58f5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -38,7 +38,7 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
struct tcp_fastopen_context *ctx =
container_of(head, struct tcp_fastopen_context, rcu);
- kzfree(ctx);
+ kfree_sensitive(ctx);
}
void tcp_fastopen_destroy_cipher(struct sock *sk)
diff --git a/net/mac80211/aead_api.c b/net/mac80211/aead_api.c
index c5fe95e49c68..d7b3d905d535 100644
--- a/net/mac80211/aead_api.c
+++ b/net/mac80211/aead_api.c
@@ -41,7 +41,7 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
aead_request_set_ad(aead_req, sg[0].length);
crypto_aead_encrypt(aead_req);
- kzfree(aead_req);
+ kfree_sensitive(aead_req);
return 0;
}
@@ -76,7 +76,7 @@ int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
aead_request_set_ad(aead_req, sg[0].length);
err = crypto_aead_decrypt(aead_req);
- kzfree(aead_req);
+ kfree_sensitive(aead_req);
return err;
}
diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c
index 16ba09cb5def..6f3b3a0cc10a 100644
--- a/net/mac80211/aes_gmac.c
+++ b/net/mac80211/aes_gmac.c
@@ -60,7 +60,7 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
aead_request_set_ad(aead_req, GMAC_AAD_LEN + data_len);
crypto_aead_encrypt(aead_req);
- kzfree(aead_req);
+ kfree_sensitive(aead_req);
return 0;
}
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 9c2888004878..2df636c32432 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -732,7 +732,7 @@ static void ieee80211_key_free_common(struct ieee80211_key *key)
ieee80211_aes_gcm_key_free(key->u.gcmp.tfm);
break;
}
- kzfree(key);
+ kfree_sensitive(key);
}
static void __ieee80211_key_destroy(struct ieee80211_key *key,
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index c079ee69d3d0..585d33144c33 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -49,7 +49,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec)
msl = container_of(sl, struct mac802154_llsec_seclevel, level);
list_del(&sl->list);
- kzfree(msl);
+ kfree_sensitive(msl);
}
list_for_each_entry_safe(dev, dn, &sec->table.devices, list) {
@@ -66,7 +66,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec)
mkey = container_of(key->key, struct mac802154_llsec_key, key);
list_del(&key->list);
llsec_key_put(mkey);
- kzfree(key);
+ kfree_sensitive(key);
}
}
@@ -155,7 +155,7 @@ err_tfm:
if (key->tfm[i])
crypto_free_aead(key->tfm[i]);
- kzfree(key);
+ kfree_sensitive(key);
return NULL;
}
@@ -170,7 +170,7 @@ static void llsec_key_release(struct kref *ref)
crypto_free_aead(key->tfm[i]);
crypto_free_sync_skcipher(key->tfm0);
- kzfree(key);
+ kfree_sensitive(key);
}
static struct mac802154_llsec_key*
@@ -261,7 +261,7 @@ int mac802154_llsec_key_add(struct mac802154_llsec *sec,
return 0;
fail:
- kzfree(new);
+ kfree_sensitive(new);
return -ENOMEM;
}
@@ -341,10 +341,10 @@ static void llsec_dev_free(struct mac802154_llsec_device *dev)
devkey);
list_del(&pos->list);
- kzfree(devkey);
+ kfree_sensitive(devkey);
}
- kzfree(dev);
+ kfree_sensitive(dev);
}
int mac802154_llsec_dev_add(struct mac802154_llsec *sec,
@@ -682,7 +682,7 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
rc = crypto_aead_encrypt(req);
- kzfree(req);
+ kfree_sensitive(req);
return rc;
}
@@ -886,7 +886,7 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
rc = crypto_aead_decrypt(req);
- kzfree(req);
+ kfree_sensitive(req);
skb_trim(skb, skb->len - authlen);
return rc;
@@ -926,7 +926,7 @@ llsec_update_devkey_record(struct mac802154_llsec_device *dev,
if (!devkey)
list_add_rcu(&next->devkey.list, &dev->dev.keys);
else
- kzfree(next);
+ kfree_sensitive(next);
spin_unlock_bh(&dev->lock);
}
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 83e97e8892e0..9e289c770574 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -49,7 +49,7 @@ void sctp_auth_key_put(struct sctp_auth_bytes *key)
return;
if (refcount_dec_and_test(&key->refcnt)) {
- kzfree(key);
+ kfree_sensitive(key);
SCTP_DBG_OBJCNT_DEC(keys);
}
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index e7180da1fc6a..794fb3001880 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -1003,7 +1003,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
err = 0;
out_err:
- kzfree(desc);
+ kfree_sensitive(desc);
crypto_free_shash(hmac);
dprintk("%s: returning %d\n", __func__, err);
return err;
@@ -1079,7 +1079,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
err = 0;
out_err:
- kzfree(desc);
+ kfree_sensitive(desc);
crypto_free_shash(hmac);
dprintk("%s: returning %d\n", __func__, err);
return err;
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 3b7f721c023b..726c076950c0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -228,11 +228,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
ret = 0;
err_free_raw:
- kzfree(rawkey);
+ kfree_sensitive(rawkey);
err_free_out:
- kzfree(outblockdata);
+ kfree_sensitive(outblockdata);
err_free_in:
- kzfree(inblockdata);
+ kfree_sensitive(inblockdata);
err_free_cipher:
crypto_free_sync_skcipher(cipher);
err_return:
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 75b3c2e9e8f8..a84a5b289484 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -443,7 +443,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
desc->tfm = hmac;
err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
- kzfree(desc);
+ kfree_sensitive(desc);
if (err)
goto out_err_free_hmac;
/*
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index c8c47fc72653..001bcb0f2480 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -441,7 +441,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
/* Allocate per-cpu TFM entry pointer */
tmp->tfm_entry = alloc_percpu(struct tipc_tfm *);
if (!tmp->tfm_entry) {
- kzfree(tmp);
+ kfree_sensitive(tmp);
return -ENOMEM;
}
@@ -491,7 +491,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
/* Not any TFM is allocated? */
if (!tfm_cnt) {
free_percpu(tmp->tfm_entry);
- kzfree(tmp);
+ kfree_sensitive(tmp);
return err;
}
@@ -545,7 +545,7 @@ static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src)
aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC);
if (unlikely(!aead->tfm_entry)) {
- kzfree(aead);
+ kfree_sensitive(aead);
return -ENOMEM;
}
@@ -1352,7 +1352,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
/* Allocate statistic structure */
c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC);
if (!c->stats) {
- kzfree(c);
+ kfree_sensitive(c);
return -ENOMEM;
}
@@ -1408,7 +1408,7 @@ void tipc_crypto_stop(struct tipc_crypto **crypto)
free_percpu(c->stats);
*crypto = NULL;
- kzfree(c);
+ kfree_sensitive(c);
}
void tipc_crypto_timeout(struct tipc_crypto *rx)
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 1971d7e6eb55..354b0ccbdc24 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1125,7 +1125,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync)
}
#ifdef CONFIG_CFG80211_WEXT
- kzfree(wdev->wext.keys);
+ kfree_sensitive(wdev->wext.keys);
wdev->wext.keys = NULL;
#endif
/* only initialized if we have a netdev */
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index ae8fe66a9bb8..a0621bb76d8e 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -127,7 +127,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
return -EINVAL;
if (WARN_ON(wdev->connect_keys))
- kzfree(wdev->connect_keys);
+ kfree_sensitive(wdev->connect_keys);
wdev->connect_keys = connkeys;
wdev->ibss_fixed = params->channel_fixed;
@@ -161,7 +161,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
ASSERT_WDEV_LOCK(wdev);
- kzfree(wdev->connect_keys);
+ kfree_sensitive(wdev->connect_keys);
wdev->connect_keys = NULL;
rdev_set_qos_map(rdev, dev, NULL);
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index f5e842ba7673..1b4d6c87a5c5 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -131,7 +131,7 @@ static void lib80211_tkip_deinit(void *priv)
crypto_free_shash(_priv->tx_tfm_michael);
crypto_free_shash(_priv->rx_tfm_michael);
}
- kzfree(priv);
+ kfree_sensitive(priv);
}
static inline u16 RotR1(u16 val)
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index dafc6f3571db..6ab9957b8f96 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -56,7 +56,7 @@ static void *lib80211_wep_init(int keyidx)
static void lib80211_wep_deinit(void *priv)
{
- kzfree(priv);
+ kfree_sensitive(priv);
}
/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 814e23d3ce7c..c04fc6cf6583 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -9836,7 +9836,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
no_ht) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return -EINVAL;
}
}
@@ -9848,7 +9848,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
int r = validate_pae_over_nl80211(rdev, info);
if (r < 0) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return r;
}
@@ -9861,7 +9861,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
wdev_lock(dev->ieee80211_ptr);
err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
if (err)
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
else if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
wdev_unlock(dev->ieee80211_ptr);
@@ -10289,7 +10289,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return -EINVAL;
}
memcpy(&connect.ht_capa,
@@ -10307,7 +10307,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) {
if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return -EINVAL;
}
memcpy(&connect.vht_capa,
@@ -10321,7 +10321,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
(rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
!wiphy_ext_feature_isset(&rdev->wiphy,
NL80211_EXT_FEATURE_RRM)) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return -EINVAL;
}
connect.flags |= ASSOC_REQ_USE_RRM;
@@ -10329,21 +10329,21 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
if (connect.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return -EOPNOTSUPP;
}
if (info->attrs[NL80211_ATTR_BSS_SELECT]) {
/* bss selection makes no sense if bssid is set */
if (connect.bssid) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return -EINVAL;
}
err = parse_bss_select(info->attrs[NL80211_ATTR_BSS_SELECT],
wiphy, &connect.bss_select);
if (err) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return err;
}
}
@@ -10373,13 +10373,13 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
return -EINVAL;
}
if (nla_get_flag(info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])) {
if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
GENL_SET_ERR_MSG(info,
"external auth requires connection ownership");
return -EINVAL;
@@ -10392,7 +10392,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
err = cfg80211_connect(rdev, dev, &connect, connkeys,
connect.prev_bssid);
if (err)
- kzfree(connkeys);
+ kfree_sensitive(connkeys);
if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 15595cf401de..985f3c23f054 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -742,7 +742,7 @@ void __cfg80211_connect_result(struct net_device *dev,
}
if (cr->status != WLAN_STATUS_SUCCESS) {
- kzfree(wdev->connect_keys);
+ kfree_sensitive(wdev->connect_keys);
wdev->connect_keys = NULL;
wdev->ssid_len = 0;
wdev->conn_owner_nlportid = 0;
@@ -1098,7 +1098,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
wdev->current_bss = NULL;
wdev->ssid_len = 0;
wdev->conn_owner_nlportid = 0;
- kzfree(wdev->connect_keys);
+ kfree_sensitive(wdev->connect_keys);
wdev->connect_keys = NULL;
nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
@@ -1281,7 +1281,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
ASSERT_WDEV_LOCK(wdev);
- kzfree(wdev->connect_keys);
+ kfree_sensitive(wdev->connect_keys);
wdev->connect_keys = NULL;
wdev->conn_owner_nlportid = 0;
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 26a977343c3b..dfad1c0f57ad 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -871,7 +871,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
}
}
- kzfree(wdev->connect_keys);
+ kfree_sensitive(wdev->connect_keys);
wdev->connect_keys = NULL;
}
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 73fd0eae08ca..73df23570d43 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -57,7 +57,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
err = cfg80211_connect(rdev, wdev->netdev,
&wdev->wext.connect, ck, prev_bssid);
if (err)
- kzfree(ck);
+ kfree_sensitive(ck);
return err;
}
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 03757cc60e06..f4beee1b0013 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -44,7 +44,8 @@ else
endif
CFLAGS_KASAN := -fsanitize=kernel-hwaddress \
- -mllvm -hwasan-instrument-stack=0 \
+ -mllvm -hwasan-instrument-stack=$(CONFIG_KASAN_STACK) \
+ -mllvm -hwasan-use-short-granules=0 \
$(instrumentation_flags)
endif # CONFIG_KASAN_SW_TAGS
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index 8c965f6a9881..d7ca46c612b3 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -26,6 +26,8 @@ def getsizes(file, format):
sym = {}
with os.popen("nm --size-sort " + file) as f:
for line in f:
+ if line.startswith("\n") or ":" in line:
+ continue
size, type, name = line.split()
if type in format:
# strip generated symbols
diff --git a/scripts/coccinelle/free/devm_free.cocci b/scripts/coccinelle/free/devm_free.cocci
index 3357bf4dbd7c..da80050b91ff 100644
--- a/scripts/coccinelle/free/devm_free.cocci
+++ b/scripts/coccinelle/free/devm_free.cocci
@@ -89,7 +89,7 @@ position p;
(
kfree@p(x)
|
- kzfree@p(x)
+ kfree_sensitive@p(x)
|
krealloc@p(x, ...)
|
@@ -112,7 +112,7 @@ position p != safe.p;
(
* kfree@p(x)
|
-* kzfree@p(x)
+* kfree_sensitive@p(x)
|
* krealloc@p(x, ...)
|
diff --git a/scripts/coccinelle/free/ifnullfree.cocci b/scripts/coccinelle/free/ifnullfree.cocci
index b3290c4ee239..2045391e36a0 100644
--- a/scripts/coccinelle/free/ifnullfree.cocci
+++ b/scripts/coccinelle/free/ifnullfree.cocci
@@ -21,7 +21,7 @@ expression E;
(
kfree(E);
|
- kzfree(E);
+ kfree_sensitive(E);
|
debugfs_remove(E);
|
@@ -42,7 +42,7 @@ position p;
@@
* if (E != NULL)
-* \(kfree@p\|kzfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
+* \(kfree@p\|kfree_sensitive@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
* usb_free_urb@p\|kmem_cache_destroy@p\|mempool_destroy@p\|
* dma_pool_destroy@p\)(E);
diff --git a/scripts/coccinelle/free/kfree.cocci b/scripts/coccinelle/free/kfree.cocci
index e9d50e718e46..168568386034 100644
--- a/scripts/coccinelle/free/kfree.cocci
+++ b/scripts/coccinelle/free/kfree.cocci
@@ -24,7 +24,7 @@ position p1;
(
* kfree@p1(E)
|
-* kzfree@p1(E)
+* kfree_sensitive@p1(E)
)
@print expression@
@@ -68,7 +68,7 @@ while (1) { ...
(
* kfree@ok(E)
|
-* kzfree@ok(E)
+* kfree_sensitive@ok(E)
)
... when != break;
when != goto l;
@@ -86,7 +86,7 @@ position free.p1!=loop.ok,p2!={print.p,sz.p};
(
* kfree@p1(E,...)
|
-* kzfree@p1(E,...)
+* kfree_sensitive@p1(E,...)
)
...
(
diff --git a/scripts/coccinelle/free/kfreeaddr.cocci b/scripts/coccinelle/free/kfreeaddr.cocci
index cfaf308328d8..142af6337a04 100644
--- a/scripts/coccinelle/free/kfreeaddr.cocci
+++ b/scripts/coccinelle/free/kfreeaddr.cocci
@@ -20,7 +20,7 @@ position p;
(
* kfree@p(&e->f)
|
-* kzfree@p(&e->f)
+* kfree_sensitive@p(&e->f)
)
@script:python depends on org@
diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch
index ac5f1267151d..e9df9cc28a85 100644
--- a/scripts/const_structs.checkpatch
+++ b/scripts/const_structs.checkpatch
@@ -44,6 +44,7 @@ platform_hibernation_ops
platform_suspend_ops
proto_ops
regmap_access_table
+regulator_ops
rpc_pipe_ops
rtc_class_ops
sd_desc
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 0869def435ee..90398347e366 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -3,18 +3,68 @@
# (c) 2014, Sasha Levin <sasha.levin@oracle.com>
#set -x
-if [[ $# < 2 ]]; then
+if [[ $# < 1 ]]; then
echo "Usage:"
- echo " $0 [vmlinux] [base path] [modules path]"
+ echo " $0 -r <release> | <vmlinux> [base path] [modules path]"
exit 1
fi
-vmlinux=$1
-basepath=$2
-modpath=$3
+if [[ $1 == "-r" ]] ; then
+ vmlinux=""
+ basepath="auto"
+ modpath=""
+ release=$2
+
+ for fn in {,/usr/lib/debug}/boot/vmlinux-$release{,.debug} /lib/modules/$release{,/build}/vmlinux ; do
+ if [ -e "$fn" ] ; then
+ vmlinux=$fn
+ break
+ fi
+ done
+
+ if [[ $vmlinux == "" ]] ; then
+ echo "ERROR! vmlinux image for release $release is not found" >&2
+ exit 2
+ fi
+else
+ vmlinux=$1
+ basepath=${2-auto}
+ modpath=$3
+ release=""
+fi
+
declare -A cache
declare -A modcache
+find_module() {
+ if [[ "$modpath" != "" ]] ; then
+ for fn in $(find "$modpath" -name "${module//_/[-_]}.ko*") ; do
+ if readelf -WS "$fn" | grep -qwF .debug_line ; then
+ echo $fn
+ return
+ fi
+ done
+ return 1
+ fi
+
+ modpath=$(dirname "$vmlinux")
+ find_module && return
+
+ if [[ $release == "" ]] ; then
+ release=$(gdb -ex 'print init_uts_ns.name.release' -ex 'quit' -quiet -batch "$vmlinux" | sed -n 's/\$1 = "\(.*\)".*/\1/p')
+ fi
+
+ for dn in {/usr/lib/debug,}/lib/modules/$release ; do
+ if [ -e "$dn" ] ; then
+ modpath="$dn"
+ find_module && return
+ fi
+ done
+
+ modpath=""
+ return 1
+}
+
parse_symbol() {
# The structure of symbol at this point is:
# ([name]+[offset]/[total length])
@@ -27,12 +77,11 @@ parse_symbol() {
elif [[ "${modcache[$module]+isset}" == "isset" ]]; then
local objfile=${modcache[$module]}
else
- if [[ $modpath == "" ]]; then
+ local objfile=$(find_module)
+ if [[ $objfile == "" ]] ; then
echo "WARNING! Modules path isn't set, but is needed to parse this symbol" >&2
return
fi
- local objfile=$(find "$modpath" -name "${module//_/[-_]}.ko*" -print -quit)
- [[ $objfile == "" ]] && return
modcache[$module]=$objfile
fi
@@ -56,7 +105,11 @@ parse_symbol() {
if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then
local base_addr=${cache[$module,$name]}
else
- local base_addr=$(nm "$objfile" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
+ local base_addr=$(nm "$objfile" | awk '$3 == "'$name'" && ($2 == "t" || $2 == "T") {print $1; exit}')
+ if [[ $base_addr == "" ]] ; then
+ # address not found
+ return
+ fi
cache[$module,$name]="$base_addr"
fi
# Let's start doing the math to get the exact address into the
@@ -148,6 +201,14 @@ handle_line() {
echo "${words[@]}" "$symbol $module"
}
+if [[ $basepath == "auto" ]] ; then
+ module=""
+ symbol="kernel_init+0x0/0x0"
+ parse_symbol
+ basepath=${symbol#kernel_init (}
+ basepath=${basepath%/init/main.c:*)}
+fi
+
while read line; do
# Let's see if we have an address in the line
if [[ $line =~ \[\<([^]]+)\>\] ]] ||
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index c45e9afaab2d..f253681e7e2a 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -149,6 +149,7 @@ arbitary||arbitrary
architechture||architecture
arguement||argument
arguements||arguments
+arithmatic||arithmetic
aritmetic||arithmetic
arne't||aren't
arraival||arrival
@@ -454,6 +455,7 @@ destorys||destroys
destroied||destroyed
detabase||database
deteced||detected
+detectt||detect
develope||develop
developement||development
developped||developed
@@ -545,6 +547,7 @@ entires||entries
entites||entities
entrys||entries
enocded||encoded
+enought||enough
enterily||entirely
enviroiment||environment
enviroment||environment
@@ -556,11 +559,14 @@ equivelant||equivalent
equivilant||equivalent
eror||error
errorr||error
+errror||error
estbalishment||establishment
etsablishment||establishment
etsbalishment||establishment
+evalution||evaluation
excecutable||executable
exceded||exceeded
+exceds||exceeds
exceeed||exceed
excellant||excellent
execeeded||exceeded
@@ -583,6 +589,7 @@ explictly||explicitly
expresion||expression
exprimental||experimental
extened||extended
+exteneded||extended
extensability||extensibility
extention||extension
extenstion||extension
@@ -610,10 +617,12 @@ feautures||features
fetaure||feature
fetaures||features
fileystem||filesystem
+fimrware||firmware
fimware||firmware
firmare||firmware
firmaware||firmware
firware||firmware
+firwmare||firmware
finanize||finalize
findn||find
finilizes||finalizes
@@ -661,6 +670,7 @@ globel||global
grabing||grabbing
grahical||graphical
grahpical||graphical
+granularty||granularity
grapic||graphic
grranted||granted
guage||gauge
@@ -906,6 +916,7 @@ miximum||maximum
mmnemonic||mnemonic
mnay||many
modfiy||modify
+modifer||modifier
modulues||modules
momery||memory
memomry||memory
@@ -915,6 +926,7 @@ monochromo||monochrome
monocrome||monochrome
mopdule||module
mroe||more
+multipler||multiplier
mulitplied||multiplied
multidimensionnal||multidimensional
multipe||multiple
@@ -952,6 +964,7 @@ occassionally||occasionally
occationally||occasionally
occurance||occurrence
occurances||occurrences
+occurd||occurred
occured||occurred
occurence||occurrence
occure||occurred
@@ -1058,6 +1071,7 @@ precission||precision
preemptable||preemptible
prefered||preferred
prefferably||preferably
+prefitler||prefilter
premption||preemption
prepaired||prepared
preperation||preparation
@@ -1101,6 +1115,7 @@ pronunce||pronounce
propery||property
propigate||propagate
propigation||propagation
+propogation||propagation
propogate||propagate
prosess||process
protable||portable
@@ -1316,6 +1331,7 @@ sturcture||structure
subdirectoires||subdirectories
suble||subtle
substract||subtract
+submited||submitted
submition||submission
suceed||succeed
succesfully||successfully
@@ -1324,6 +1340,7 @@ successed||succeeded
successfull||successful
successfuly||successfully
sucessfully||successfully
+sucessful||successful
sucess||success
superflous||superfluous
superseeded||superseded
@@ -1409,6 +1426,7 @@ transormed||transformed
trasfer||transfer
trasmission||transmission
treshold||threshold
+triggerd||triggered
trigerred||triggered
trigerring||triggering
trun||turn
@@ -1421,6 +1439,7 @@ uknown||unknown
usccess||success
usupported||unsupported
uncommited||uncommitted
+uncompatible||incompatible
unconditionaly||unconditionally
undeflow||underflow
underun||underrun
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 4e18ae5282a6..32d3f53af10b 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -91,20 +91,10 @@ all_sources()
all_compiled_sources()
{
- for i in $(all_sources); do
- case "$i" in
- *.[cS])
- j=${i/\.[cS]/\.o}
- j="${j#$tree}"
- if [ -e $j ]; then
- echo $i
- fi
- ;;
- *)
- echo $i
- ;;
- esac
- done
+ realpath -es $([ -z "$KBUILD_ABS_SRCTREE" ] && echo --relative-to=.) \
+ include/generated/autoconf.h $(find -name "*.cmd" -exec \
+ grep -Poh '(?(?=^source_.* \K).*|(?=^ \K\S).*(?= \\))' {} \+ |
+ awk '!a[$0]++') | sort -u
}
all_target_sources()
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 1c898055a476..7b0e13ce7dc7 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -40,8 +40,8 @@ void aa_free_domain_entries(struct aa_domain *domain)
return;
for (i = 0; i < domain->size; i++)
- kzfree(domain->table[i]);
- kzfree(domain->table);
+ kfree_sensitive(domain->table[i]);
+ kfree_sensitive(domain->table);
domain->table = NULL;
}
}
diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h
index aff26fc71407..d4f8948517d9 100644
--- a/security/apparmor/include/file.h
+++ b/security/apparmor/include/file.h
@@ -72,7 +72,7 @@ static inline void aa_free_file_ctx(struct aa_file_ctx *ctx)
{
if (ctx) {
aa_put_label(rcu_access_pointer(ctx->label));
- kzfree(ctx);
+ kfree_sensitive(ctx);
}
}
diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c
index af4f50fda9e3..4c010c9a6af1 100644
--- a/security/apparmor/policy.c
+++ b/security/apparmor/policy.c
@@ -187,9 +187,9 @@ static void aa_free_data(void *ptr, void *arg)
{
struct aa_data *data = ptr;
- kzfree(data->data);
- kzfree(data->key);
- kzfree(data);
+ kfree_sensitive(data->data);
+ kfree_sensitive(data->key);
+ kfree_sensitive(data);
}
/**
@@ -217,19 +217,19 @@ void aa_free_profile(struct aa_profile *profile)
aa_put_profile(rcu_access_pointer(profile->parent));
aa_put_ns(profile->ns);
- kzfree(profile->rename);
+ kfree_sensitive(profile->rename);
aa_free_file_rules(&profile->file);
aa_free_cap_rules(&profile->caps);
aa_free_rlimit_rules(&profile->rlimits);
for (i = 0; i < profile->xattr_count; i++)
- kzfree(profile->xattrs[i]);
- kzfree(profile->xattrs);
+ kfree_sensitive(profile->xattrs[i]);
+ kfree_sensitive(profile->xattrs);
for (i = 0; i < profile->secmark_count; i++)
- kzfree(profile->secmark[i].label);
- kzfree(profile->secmark);
- kzfree(profile->dirname);
+ kfree_sensitive(profile->secmark[i].label);
+ kfree_sensitive(profile->secmark);
+ kfree_sensitive(profile->dirname);
aa_put_dfa(profile->xmatch);
aa_put_dfa(profile->policy.dfa);
@@ -237,14 +237,14 @@ void aa_free_profile(struct aa_profile *profile)
rht = profile->data;
profile->data = NULL;
rhashtable_free_and_destroy(rht, aa_free_data, NULL);
- kzfree(rht);
+ kfree_sensitive(rht);
}
- kzfree(profile->hash);
+ kfree_sensitive(profile->hash);
aa_put_loaddata(profile->rawdata);
aa_label_destroy(&profile->label);
- kzfree(profile);
+ kfree_sensitive(profile);
}
/**
diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c
index d7ef540027a5..70921d95fb40 100644
--- a/security/apparmor/policy_ns.c
+++ b/security/apparmor/policy_ns.c
@@ -121,9 +121,9 @@ static struct aa_ns *alloc_ns(const char *prefix, const char *name)
return ns;
fail_unconfined:
- kzfree(ns->base.hname);
+ kfree_sensitive(ns->base.hname);
fail_ns:
- kzfree(ns);
+ kfree_sensitive(ns);
return NULL;
}
@@ -145,7 +145,7 @@ void aa_free_ns(struct aa_ns *ns)
ns->unconfined->ns = NULL;
aa_free_profile(ns->unconfined);
- kzfree(ns);
+ kfree_sensitive(ns);
}
/**
diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c
index b67322abcc33..dc345ac93205 100644
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -163,10 +163,10 @@ static void do_loaddata_free(struct work_struct *work)
aa_put_ns(ns);
}
- kzfree(d->hash);
- kzfree(d->name);
+ kfree_sensitive(d->hash);
+ kfree_sensitive(d->name);
kvfree(d->data);
- kzfree(d);
+ kfree_sensitive(d);
}
void aa_loaddata_kref(struct kref *kref)
@@ -894,7 +894,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
while (unpack_strdup(e, &key, NULL)) {
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data) {
- kzfree(key);
+ kfree_sensitive(key);
goto fail;
}
@@ -902,8 +902,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
data->size = unpack_blob(e, &data->data, NULL);
data->data = kvmemdup(data->data, data->size);
if (data->size && !data->data) {
- kzfree(data->key);
- kzfree(data);
+ kfree_sensitive(data->key);
+ kfree_sensitive(data);
goto fail;
}
@@ -1037,7 +1037,7 @@ void aa_load_ent_free(struct aa_load_ent *ent)
aa_put_profile(ent->old);
aa_put_profile(ent->new);
kfree(ent->ns_name);
- kzfree(ent);
+ kfree_sensitive(ent);
}
}
diff --git a/security/keys/big_key.c b/security/keys/big_key.c
index dd708e8f13c0..691347dea3c1 100644
--- a/security/keys/big_key.c
+++ b/security/keys/big_key.c
@@ -138,7 +138,7 @@ int big_key_preparse(struct key_preparsed_payload *prep)
err_fput:
fput(file);
err_enckey:
- kzfree(enckey);
+ kfree_sensitive(enckey);
error:
memzero_explicit(buf, enclen);
kvfree(buf);
@@ -155,7 +155,7 @@ void big_key_free_preparse(struct key_preparsed_payload *prep)
path_put(path);
}
- kzfree(prep->payload.data[big_key_data]);
+ kfree_sensitive(prep->payload.data[big_key_data]);
}
/*
@@ -187,7 +187,7 @@ void big_key_destroy(struct key *key)
path->mnt = NULL;
path->dentry = NULL;
}
- kzfree(key->payload.data[big_key_data]);
+ kfree_sensitive(key->payload.data[big_key_data]);
key->payload.data[big_key_data] = NULL;
}
diff --git a/security/keys/dh.c b/security/keys/dh.c
index c4c629bb1c03..1abfa70ed6e1 100644
--- a/security/keys/dh.c
+++ b/security/keys/dh.c
@@ -58,9 +58,9 @@ error:
static void dh_free_data(struct dh *dh)
{
- kzfree(dh->key);
- kzfree(dh->p);
- kzfree(dh->g);
+ kfree_sensitive(dh->key);
+ kfree_sensitive(dh->p);
+ kfree_sensitive(dh->g);
}
struct dh_completion {
@@ -126,7 +126,7 @@ static void kdf_dealloc(struct kdf_sdesc *sdesc)
if (sdesc->shash.tfm)
crypto_free_shash(sdesc->shash.tfm);
- kzfree(sdesc);
+ kfree_sensitive(sdesc);
}
/*
@@ -220,7 +220,7 @@ static int keyctl_dh_compute_kdf(struct kdf_sdesc *sdesc,
ret = -EFAULT;
err:
- kzfree(outbuf);
+ kfree_sensitive(outbuf);
return ret;
}
@@ -395,11 +395,11 @@ long __keyctl_dh_compute(struct keyctl_dh_params __user *params,
out6:
kpp_request_free(req);
out5:
- kzfree(outbuf);
+ kfree_sensitive(outbuf);
out4:
crypto_free_kpp(tfm);
out3:
- kzfree(secret);
+ kfree_sensitive(secret);
out2:
dh_free_data(&dh_inputs);
out1:
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index 14cf81d1a30b..deebbf14eeca 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -370,7 +370,7 @@ static int get_derived_key(u8 *derived_key, enum derived_key_type key_type,
master_keylen);
ret = crypto_shash_tfm_digest(hash_tfm, derived_buf, derived_buf_len,
derived_key);
- kzfree(derived_buf);
+ kfree_sensitive(derived_buf);
return ret;
}
@@ -812,13 +812,13 @@ static int encrypted_instantiate(struct key *key,
ret = encrypted_init(epayload, key->description, format, master_desc,
decrypted_datalen, hex_encoded_iv);
if (ret < 0) {
- kzfree(epayload);
+ kfree_sensitive(epayload);
goto out;
}
rcu_assign_keypointer(key, epayload);
out:
- kzfree(datablob);
+ kfree_sensitive(datablob);
return ret;
}
@@ -827,7 +827,7 @@ static void encrypted_rcu_free(struct rcu_head *rcu)
struct encrypted_key_payload *epayload;
epayload = container_of(rcu, struct encrypted_key_payload, rcu);
- kzfree(epayload);
+ kfree_sensitive(epayload);
}
/*
@@ -885,7 +885,7 @@ static int encrypted_update(struct key *key, struct key_preparsed_payload *prep)
rcu_assign_keypointer(key, new_epayload);
call_rcu(&epayload->rcu, encrypted_rcu_free);
out:
- kzfree(buf);
+ kfree_sensitive(buf);
return ret;
}
@@ -946,7 +946,7 @@ static long encrypted_read(const struct key *key, char *buffer,
memzero_explicit(derived_key, sizeof(derived_key));
memcpy(buffer, ascii_buf, asciiblob_len);
- kzfree(ascii_buf);
+ kfree_sensitive(ascii_buf);
return asciiblob_len;
out:
@@ -961,7 +961,7 @@ out:
*/
static void encrypted_destroy(struct key *key)
{
- kzfree(key->payload.data[0]);
+ kfree_sensitive(key->payload.data[0]);
}
struct key_type key_type_encrypted = {
diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c
index 8001ab07e63b..b9fe02e5f84f 100644
--- a/security/keys/trusted-keys/trusted_tpm1.c
+++ b/security/keys/trusted-keys/trusted_tpm1.c
@@ -68,7 +68,7 @@ static int TSS_sha1(const unsigned char *data, unsigned int datalen,
}
ret = crypto_shash_digest(&sdesc->shash, data, datalen, digest);
- kzfree(sdesc);
+ kfree_sensitive(sdesc);
return ret;
}
@@ -112,7 +112,7 @@ static int TSS_rawhmac(unsigned char *digest, const unsigned char *key,
if (!ret)
ret = crypto_shash_final(&sdesc->shash, digest);
out:
- kzfree(sdesc);
+ kfree_sensitive(sdesc);
return ret;
}
@@ -166,7 +166,7 @@ int TSS_authhmac(unsigned char *digest, const unsigned char *key,
paramdigest, TPM_NONCE_SIZE, h1,
TPM_NONCE_SIZE, h2, 1, &c, 0, 0);
out:
- kzfree(sdesc);
+ kfree_sensitive(sdesc);
return ret;
}
EXPORT_SYMBOL_GPL(TSS_authhmac);
@@ -251,7 +251,7 @@ int TSS_checkhmac1(unsigned char *buffer,
if (memcmp(testhmac, authdata, SHA1_DIGEST_SIZE))
ret = -EINVAL;
out:
- kzfree(sdesc);
+ kfree_sensitive(sdesc);
return ret;
}
EXPORT_SYMBOL_GPL(TSS_checkhmac1);
@@ -353,7 +353,7 @@ static int TSS_checkhmac2(unsigned char *buffer,
if (memcmp(testhmac2, authdata2, SHA1_DIGEST_SIZE))
ret = -EINVAL;
out:
- kzfree(sdesc);
+ kfree_sensitive(sdesc);
return ret;
}
@@ -563,7 +563,7 @@ static int tpm_seal(struct tpm_buf *tb, uint16_t keytype,
*bloblen = storedsize;
}
out:
- kzfree(td);
+ kfree_sensitive(td);
return ret;
}
@@ -1031,12 +1031,12 @@ static int trusted_instantiate(struct key *key,
if (!ret && options->pcrlock)
ret = pcrlock(options->pcrlock);
out:
- kzfree(datablob);
- kzfree(options);
+ kfree_sensitive(datablob);
+ kfree_sensitive(options);
if (!ret)
rcu_assign_keypointer(key, payload);
else
- kzfree(payload);
+ kfree_sensitive(payload);
return ret;
}
@@ -1045,7 +1045,7 @@ static void trusted_rcu_free(struct rcu_head *rcu)
struct trusted_key_payload *p;
p = container_of(rcu, struct trusted_key_payload, rcu);
- kzfree(p);
+ kfree_sensitive(p);
}
/*
@@ -1087,13 +1087,13 @@ static int trusted_update(struct key *key, struct key_preparsed_payload *prep)
ret = datablob_parse(datablob, new_p, new_o);
if (ret != Opt_update) {
ret = -EINVAL;
- kzfree(new_p);
+ kfree_sensitive(new_p);
goto out;
}
if (!new_o->keyhandle) {
ret = -EINVAL;
- kzfree(new_p);
+ kfree_sensitive(new_p);
goto out;
}
@@ -1107,22 +1107,22 @@ static int trusted_update(struct key *key, struct key_preparsed_payload *prep)
ret = key_seal(new_p, new_o);
if (ret < 0) {
pr_info("trusted_key: key_seal failed (%d)\n", ret);
- kzfree(new_p);
+ kfree_sensitive(new_p);
goto out;
}
if (new_o->pcrlock) {
ret = pcrlock(new_o->pcrlock);
if (ret < 0) {
pr_info("trusted_key: pcrlock failed (%d)\n", ret);
- kzfree(new_p);
+ kfree_sensitive(new_p);
goto out;
}
}
rcu_assign_keypointer(key, new_p);
call_rcu(&p->rcu, trusted_rcu_free);
out:
- kzfree(datablob);
- kzfree(new_o);
+ kfree_sensitive(datablob);
+ kfree_sensitive(new_o);
return ret;
}
@@ -1154,7 +1154,7 @@ static long trusted_read(const struct key *key, char *buffer,
*/
static void trusted_destroy(struct key *key)
{
- kzfree(key->payload.data[0]);
+ kfree_sensitive(key->payload.data[0]);
}
struct key_type key_type_trusted = {
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index 07d4287e9084..749e2a4dcb13 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(user_preparse);
*/
void user_free_preparse(struct key_preparsed_payload *prep)
{
- kzfree(prep->payload.data[0]);
+ kfree_sensitive(prep->payload.data[0]);
}
EXPORT_SYMBOL_GPL(user_free_preparse);
@@ -91,7 +91,7 @@ static void user_free_payload_rcu(struct rcu_head *head)
struct user_key_payload *payload;
payload = container_of(head, struct user_key_payload, rcu);
- kzfree(payload);
+ kfree_sensitive(payload);
}
/*
@@ -147,7 +147,7 @@ void user_destroy(struct key *key)
{
struct user_key_payload *upayload = key->payload.data[0];
- kzfree(upayload);
+ kfree_sensitive(upayload);
}
EXPORT_SYMBOL_GPL(user_destroy);
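
The conversions above are mechanical: kfree_sensitive() is the new name for kzfree(), and it zeroes an object before handing it back to the allocator so that key material does not linger in freed memory. As a rough userspace analogue (a minimal sketch, assuming glibc's explicit_bzero() and a caller that knows the allocation size; the kernel helper obtains the size from the allocator itself):

#include <stdlib.h>
#include <string.h>

/* Sketch only: zeroize, then free. Not the kernel implementation. */
static void free_sensitive(void *p, size_t len)
{
	if (!p)
		return;
	explicit_bzero(p, len);	/* unlike memset(), not optimized away */
	free(p);
}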
diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py
new file mode 100644
index 000000000000..c4225ed63565
--- /dev/null
+++ b/tools/cgroup/memcg_slabinfo.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2020 Roman Gushchin <guro@fb.com>
+# Copyright (C) 2020 Facebook
+
+from os import stat
+import argparse
+import sys
+
+from drgn.helpers.linux import list_for_each_entry, list_empty
+from drgn.helpers.linux import for_each_page
+from drgn.helpers.linux.cpumask import for_each_online_cpu
+from drgn.helpers.linux.percpu import per_cpu_ptr
+from drgn import container_of, FaultError, Object
+
+
+DESC = """
+This is a drgn script to provide slab statistics for memory cgroups.
+It supports cgroup v2 and v1 and can emulate the memory.kmem.slabinfo
+interface of cgroup v1.
+For drgn, visit https://github.com/osandov/drgn.
+"""
+
+
+MEMCGS = {}
+
+OO_SHIFT = 16
+OO_MASK = ((1 << OO_SHIFT) - 1)
+
+
+def err(s):
+ print('slabinfo.py: error: %s' % s, file=sys.stderr, flush=True)
+ sys.exit(1)
+
+
+def find_memcg_ids(css=prog['root_mem_cgroup'].css, prefix=''):
+ if not list_empty(css.children.address_of_()):
+ for css in list_for_each_entry('struct cgroup_subsys_state',
+ css.children.address_of_(),
+ 'sibling'):
+ name = prefix + '/' + css.cgroup.kn.name.string_().decode('utf-8')
+ memcg = container_of(css, 'struct mem_cgroup', 'css')
+ MEMCGS[css.cgroup.kn.id.value_()] = memcg
+ find_memcg_ids(css, name)
+
+
+def is_root_cache(s):
+ try:
+ return False if s.memcg_params.root_cache else True
+ except AttributeError:
+ return True
+
+
+def cache_name(s):
+ if is_root_cache(s):
+ return s.name.string_().decode('utf-8')
+ else:
+ return s.memcg_params.root_cache.name.string_().decode('utf-8')
+
+
+# SLUB
+
+def oo_order(s):
+ return s.oo.x >> OO_SHIFT
+
+
+def oo_objects(s):
+ return s.oo.x & OO_MASK
+
+
+def count_partial(n, fn):
+ nr_pages = 0
+ for page in list_for_each_entry('struct page', n.partial.address_of_(),
+ 'lru'):
+ nr_pages += fn(page)
+ return nr_pages
+
+
+def count_free(page):
+ return page.objects - page.inuse
+
+
+def slub_get_slabinfo(s, cfg):
+ nr_slabs = 0
+ nr_objs = 0
+ nr_free = 0
+
+ for node in range(cfg['nr_nodes']):
+ n = s.node[node]
+ nr_slabs += n.nr_slabs.counter.value_()
+ nr_objs += n.total_objects.counter.value_()
+ nr_free += count_partial(n, count_free)
+
+ return {'active_objs': nr_objs - nr_free,
+ 'num_objs': nr_objs,
+ 'active_slabs': nr_slabs,
+ 'num_slabs': nr_slabs,
+ 'objects_per_slab': oo_objects(s),
+ 'cache_order': oo_order(s),
+ 'limit': 0,
+ 'batchcount': 0,
+ 'shared': 0,
+ 'shared_avail': 0}
+
+
+def cache_show(s, cfg, objs):
+ if cfg['allocator'] == 'SLUB':
+ sinfo = slub_get_slabinfo(s, cfg)
+ else:
+ err('SLAB isn\'t supported yet')
+
+ if cfg['shared_slab_pages']:
+ sinfo['active_objs'] = objs
+ sinfo['num_objs'] = objs
+
+ print('%-17s %6lu %6lu %6u %4u %4d'
+ ' : tunables %4u %4u %4u'
+ ' : slabdata %6lu %6lu %6lu' % (
+ cache_name(s), sinfo['active_objs'], sinfo['num_objs'],
+ s.size, sinfo['objects_per_slab'], 1 << sinfo['cache_order'],
+ sinfo['limit'], sinfo['batchcount'], sinfo['shared'],
+ sinfo['active_slabs'], sinfo['num_slabs'],
+ sinfo['shared_avail']))
+
+
+def detect_kernel_config():
+ cfg = {}
+
+ cfg['nr_nodes'] = prog['nr_online_nodes'].value_()
+
+ if prog.type('struct kmem_cache').members[1][1] == 'flags':
+ cfg['allocator'] = 'SLUB'
+ elif prog.type('struct kmem_cache').members[1][1] == 'batchcount':
+ cfg['allocator'] = 'SLAB'
+ else:
+ err('Can\'t determine the slab allocator')
+
+ cfg['shared_slab_pages'] = False
+ try:
+ if prog.type('struct obj_cgroup'):
+ cfg['shared_slab_pages'] = True
+ except:
+ pass
+
+ return cfg
+
+
+def for_each_slab_page(prog):
+ PGSlab = 1 << prog.constant('PG_slab')
+ PGHead = 1 << prog.constant('PG_head')
+
+ for page in for_each_page(prog):
+ try:
+ if page.flags.value_() & PGSlab:
+ yield page
+ except FaultError:
+ pass
+
+
+def main():
+ parser = argparse.ArgumentParser(description=DESC,
+ formatter_class=
+ argparse.RawTextHelpFormatter)
+ parser.add_argument('cgroup', metavar='CGROUP',
+ help='Target memory cgroup')
+ args = parser.parse_args()
+
+ try:
+ cgroup_id = stat(args.cgroup).st_ino
+ find_memcg_ids()
+ memcg = MEMCGS[cgroup_id]
+ except KeyError:
+ err('Can\'t find the memory cgroup')
+
+ cfg = detect_kernel_config()
+
+ print('# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>'
+ ' : tunables <limit> <batchcount> <sharedfactor>'
+ ' : slabdata <active_slabs> <num_slabs> <sharedavail>')
+
+ if cfg['shared_slab_pages']:
+ obj_cgroups = set()
+ stats = {}
+ caches = {}
+
+ # find memcg pointers belonging to the specified cgroup
+ obj_cgroups.add(memcg.objcg.value_())
+ for ptr in list_for_each_entry('struct obj_cgroup',
+ memcg.objcg_list.address_of_(),
+ 'list'):
+ obj_cgroups.add(ptr.value_())
+
+ # look over all slab pages belonging to non-root memcgs
+ # and look for objects belonging to the given memory cgroup
+ for page in for_each_slab_page(prog):
+ objcg_vec_raw = page.obj_cgroups.value_()
+ if objcg_vec_raw == 0:
+ continue
+ cache = page.slab_cache
+ if not cache:
+ continue
+ addr = cache.value_()
+ caches[addr] = cache
+ # clear the lowest bit to get the true obj_cgroups
+ objcg_vec = Object(prog, page.obj_cgroups.type_,
+ value=objcg_vec_raw & ~1)
+
+ if addr not in stats:
+ stats[addr] = 0
+
+ for i in range(oo_objects(cache)):
+ if objcg_vec[i].value_() in obj_cgroups:
+ stats[addr] += 1
+
+ for addr in caches:
+ if stats[addr] > 0:
+ cache_show(caches[addr], cfg, stats[addr])
+
+ else:
+ for s in list_for_each_entry('struct kmem_cache',
+ memcg.kmem_caches.address_of_(),
+ 'memcg_params.kmem_caches_node'):
+ cache_show(s, cfg, None)
+
+
+main()
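
For reference, the script is intended to be run with drgn against the live kernel, which typically requires root privileges and kernel debug info. Assuming drgn is installed, an invocation along these lines is expected (the exact cgroup path depends on the local cgroup mount):

	sudo ./memcg_slabinfo.py /sys/fs/cgroup/<target-cgroup>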
diff --git a/tools/include/linux/jhash.h b/tools/include/linux/jhash.h
index 348c6f47e4cc..af8d0fe1c6ce 100644
--- a/tools/include/linux/jhash.h
+++ b/tools/include/linux/jhash.h
@@ -5,7 +5,7 @@
*
* Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
*
- * http://burtleburtle.net/bob/hash/
+ * https://burtleburtle.net/bob/hash/
*
* These are the credits from Bob's sources:
*
diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c
index 06ac7bd2144b..727396de6be5 100644
--- a/tools/lib/rbtree.c
+++ b/tools/lib/rbtree.c
@@ -13,7 +13,7 @@
#include <linux/export.h>
/*
- * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree
+ * red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index b77837f75a0d..ad7799c85429 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -379,7 +379,7 @@ enum tep_errno {
* errno since SUS requires the errno has distinct positive values.
* See 'Issue 6' in the link below.
*
- * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html
+ * https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html
*/
__TEP_ERRNO__START = -100000,
diff --git a/tools/testing/ktest/examples/README b/tools/testing/ktest/examples/README
index a12d295a09d8..4f048789b260 100644
--- a/tools/testing/ktest/examples/README
+++ b/tools/testing/ktest/examples/README
@@ -11,7 +11,7 @@ crosstests.conf - this config shows an example of testing a git repo against
lots of different architectures. It only does build tests, but makes
it easy to compile test different archs. You can download the arch
cross compilers from:
- http://kernel.org/pub/tools/crosstool/files/bin/x86_64/
+ https://kernel.org/pub/tools/crosstool/files/bin/x86_64/
test.conf - A generic example of a config. This is based on an actual config
used to perform real testing.
diff --git a/tools/testing/ktest/examples/crosstests.conf b/tools/testing/ktest/examples/crosstests.conf
index 6907f32590b2..3b15e85f26bd 100644
--- a/tools/testing/ktest/examples/crosstests.conf
+++ b/tools/testing/ktest/examples/crosstests.conf
@@ -3,7 +3,7 @@
#
# In this config, it is expected that the tool chains from:
#
-# http://kernel.org/pub/tools/crosstool/files/bin/x86_64/
+# https://kernel.org/pub/tools/crosstool/files/bin/x86_64/
#
# running on a x86_64 system have been downloaded and installed into:
#
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index e03bc15ce731..9018f45d631d 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -32,6 +32,7 @@ TARGETS += lkdtm
TARGETS += membarrier
TARGETS += memfd
TARGETS += memory-hotplug
+TARGETS += mincore
TARGETS += mount
TARGETS += mqueue
TARGETS += net
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore
index aa6de65b0838..84cfcabea838 100644
--- a/tools/testing/selftests/cgroup/.gitignore
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -2,3 +2,4 @@
test_memcontrol
test_core
test_freezer
+test_kmem
\ No newline at end of file
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index 967f268fde74..f027d933595b 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -6,11 +6,13 @@ all:
TEST_FILES := with_stress.sh
TEST_PROGS := test_stress.sh
TEST_GEN_PROGS = test_memcontrol
+TEST_GEN_PROGS += test_kmem
TEST_GEN_PROGS += test_core
TEST_GEN_PROGS += test_freezer
include ../lib.mk
$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h
$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
index 8a637ca7d73a..05853b0b8831 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -106,7 +106,7 @@ int cg_read_strcmp(const char *cgroup, const char *control,
/* Handle the case of comparing against empty string */
if (!expected)
- size = 32;
+ return -1;
else
size = strlen(expected) + 1;
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
new file mode 100644
index 000000000000..5224dae216e5
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <sys/sysinfo.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+
+static int alloc_dcache(const char *cgroup, void *arg)
+{
+ unsigned long i;
+ struct stat st;
+ char buf[128];
+
+ for (i = 0; i < (unsigned long)arg; i++) {
+ snprintf(buf, sizeof(buf),
+ "/something-non-existent-with-a-long-name-%64lu-%d",
+ i, getpid());
+ stat(buf, &st);
+ }
+
+ return 0;
+}
+
+/*
+ * This test allocates 100000 negative dentries with long names.
+ * Then it checks that "slab" in memory.stat is larger than 1M.
+ * Then it sets memory.high to 1M and checks that at least 1/2
+ * of slab memory has been reclaimed.
+ */
+static int test_kmem_basic(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg = NULL;
+ long slab0, slab1, current;
+
+ cg = cg_name(root, "kmem_basic_test");
+ if (!cg)
+ goto cleanup;
+
+ if (cg_create(cg))
+ goto cleanup;
+
+ if (cg_run(cg, alloc_dcache, (void *)100000))
+ goto cleanup;
+
+ slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
+ if (slab0 < (1 << 20))
+ goto cleanup;
+
+ cg_write(cg, "memory.high", "1M");
+ slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
+ if (slab1 <= 0)
+ goto cleanup;
+
+ current = cg_read_long(cg, "memory.current");
+ if (current <= 0)
+ goto cleanup;
+
+ if (slab1 < slab0 / 2 && current < slab0 / 2)
+ ret = KSFT_PASS;
+cleanup:
+ cg_destroy(cg);
+ free(cg);
+
+ return ret;
+}
+
+static void *alloc_kmem_fn(void *arg)
+{
+ alloc_dcache(NULL, (void *)100);
+ return NULL;
+}
+
+static int alloc_kmem_smp(const char *cgroup, void *arg)
+{
+ int nr_threads = 2 * get_nprocs();
+ pthread_t *tinfo;
+ unsigned long i;
+ int ret = -1;
+
+ tinfo = calloc(nr_threads, sizeof(pthread_t));
+ if (tinfo == NULL)
+ return -1;
+
+ for (i = 0; i < nr_threads; i++) {
+ if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
+ (void *)i)) {
+ free(tinfo);
+ return -1;
+ }
+ }
+
+ for (i = 0; i < nr_threads; i++) {
+ ret = pthread_join(tinfo[i], NULL);
+ if (ret)
+ break;
+ }
+
+ free(tinfo);
+ return ret;
+}
+
+static int cg_run_in_subcgroups(const char *parent,
+ int (*fn)(const char *cgroup, void *arg),
+ void *arg, int times)
+{
+ char *child;
+ int i;
+
+ for (i = 0; i < times; i++) {
+ child = cg_name_indexed(parent, "child", i);
+ if (!child)
+ return -1;
+
+ if (cg_create(child)) {
+ cg_destroy(child);
+ free(child);
+ return -1;
+ }
+
+ if (cg_run(child, fn, NULL)) {
+ cg_destroy(child);
+ free(child);
+ return -1;
+ }
+
+ cg_destroy(child);
+ free(child);
+ }
+
+ return 0;
+}
+
+/*
+ * The test creates and destroys a large number of cgroups. In each cgroup it
+ * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
+ * threads. Then it checks the sanity of the numbers on the parent level:
+ * the parent's memory.current should be roughly equal to
+ * anon + file + slab + kernel_stack.
+ */
+static int test_kmem_memcg_deletion(const char *root)
+{
+ long current, slab, anon, file, kernel_stack, sum;
+ int ret = KSFT_FAIL;
+ char *parent;
+
+ parent = cg_name(root, "kmem_memcg_deletion_test");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
+ goto cleanup;
+
+ current = cg_read_long(parent, "memory.current");
+ slab = cg_read_key_long(parent, "memory.stat", "slab ");
+ anon = cg_read_key_long(parent, "memory.stat", "anon ");
+ file = cg_read_key_long(parent, "memory.stat", "file ");
+ kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
+ if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
+ kernel_stack < 0)
+ goto cleanup;
+
+ sum = slab + anon + file + kernel_stack;
+ if (abs(sum - current) < 4096 * 32 * 2 * get_nprocs()) {
+ ret = KSFT_PASS;
+ } else {
+ printf("memory.current = %ld\n", current);
+ printf("slab + anon + file + kernel_stack = %ld\n", sum);
+ printf("slab = %ld\n", slab);
+ printf("anon = %ld\n", anon);
+ printf("file = %ld\n", file);
+ printf("kernel_stack = %ld\n", kernel_stack);
+ }
+
+cleanup:
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+/*
+ * The test reads the entire /proc/kpagecgroup. If the read completes
+ * successfully (and the kernel doesn't panic), the test is treated as passed.
+ */
+static int test_kmem_proc_kpagecgroup(const char *root)
+{
+ unsigned long buf[128];
+ int ret = KSFT_FAIL;
+ ssize_t len;
+ int fd;
+
+ fd = open("/proc/kpagecgroup", O_RDONLY);
+ if (fd < 0)
+ return ret;
+
+ do {
+ len = read(fd, buf, sizeof(buf));
+ } while (len > 0);
+
+ if (len == 0)
+ ret = KSFT_PASS;
+
+ close(fd);
+ return ret;
+}
+
+static void *pthread_wait_fn(void *arg)
+{
+ sleep(100);
+ return NULL;
+}
+
+static int spawn_1000_threads(const char *cgroup, void *arg)
+{
+ int nr_threads = 1000;
+ pthread_t *tinfo;
+ unsigned long i;
+ long stack;
+ int ret = -1;
+
+ tinfo = calloc(nr_threads, sizeof(pthread_t));
+ if (tinfo == NULL)
+ return -1;
+
+ for (i = 0; i < nr_threads; i++) {
+ if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
+ (void *)i)) {
+ free(tinfo);
+ return -1;
+ }
+ }
+
+ stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
+ if (stack >= 4096 * 1000)
+ ret = 0;
+
+ free(tinfo);
+ return ret;
+}
+
+/*
+ * The test spawns a process that creates 1000 threads. Then it checks
+ * that memory.stat's kernel_stack is at least 1000 pages large.
+ */
+static int test_kmem_kernel_stacks(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg = NULL;
+
+ cg = cg_name(root, "kmem_kernel_stacks_test");
+ if (!cg)
+ goto cleanup;
+
+ if (cg_create(cg))
+ goto cleanup;
+
+ if (cg_run(cg, spawn_1000_threads, NULL))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+cleanup:
+ cg_destroy(cg);
+ free(cg);
+
+ return ret;
+}
+
+/*
+ * This test sequentially creates 30 child cgroups, allocates some
+ * kernel memory in each of them, and deletes them. Then it checks
+ * that the number of dying cgroups on the parent level is 0.
+ */
+static int test_kmem_dead_cgroups(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent;
+ long dead;
+ int i;
+
+ parent = cg_name(root, "kmem_dead_cgroups_test");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
+ goto cleanup;
+
+ for (i = 0; i < 5; i++) {
+ dead = cg_read_key_long(parent, "cgroup.stat",
+ "nr_dying_descendants ");
+ if (dead == 0) {
+ ret = KSFT_PASS;
+ break;
+ }
+ /*
+ * Reclaiming cgroups might take some time,
+ * let's wait a bit and repeat.
+ */
+ sleep(1);
+ }
+
+cleanup:
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+#define T(x) { x, #x }
+struct kmem_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_kmem_basic),
+ T(test_kmem_memcg_deletion),
+ T(test_kmem_proc_kpagecgroup),
+ T(test_kmem_kernel_stacks),
+ T(test_kmem_dead_cgroups),
+};
+#undef T
+
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root)))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ /*
+ * Check that memory controller is available:
+ * memory is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+ ksft_exit_skip("memory controller isn't available\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+ if (cg_write(root, "cgroup.subtree_control", "+memory"))
+ ksft_exit_skip("Failed to set memory controller\n");
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
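
The tests above rely on cg_read_key_long() from cgroup_util.c to pull single counters such as "slab " or "nr_dying_descendants " out of memory.stat and cgroup.stat. As a rough illustration of that pattern, a hypothetical standalone helper (not the actual cgroup_util.c code) could look like:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical sketch: return the value following "key" in a stat-style file. */
static long read_stat_key(const char *path, const char *key)
{
	char line[256];
	long val = -1;
	FILE *f = fopen(path, "r");	/* e.g. <cgroup>/memory.stat */

	if (!f)
		return -1;

	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, key, strlen(key))) {
			val = atol(line + strlen(key));
			break;
		}
	}

	fclose(f);
	return val;
}

Passing the key with a trailing space, as the tests do, keeps "slab " from accidentally matching longer keys that share the same prefix.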
diff --git a/tools/testing/selftests/mincore/.gitignore b/tools/testing/selftests/mincore/.gitignore
new file mode 100644
index 000000000000..15c4dfc2df00
--- /dev/null
+++ b/tools/testing/selftests/mincore/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0+
+mincore_selftest
diff --git a/tools/testing/selftests/mincore/Makefile b/tools/testing/selftests/mincore/Makefile
new file mode 100644
index 000000000000..38c7db1e8926
--- /dev/null
+++ b/tools/testing/selftests/mincore/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+CFLAGS += -Wall
+
+TEST_GEN_PROGS := mincore_selftest
+include ../lib.mk
diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c
new file mode 100644
index 000000000000..5a1e85ff5d32
--- /dev/null
+++ b/tools/testing/selftests/mincore/mincore_selftest.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * kselftest suite for mincore().
+ *
+ * Copyright (C) 2020 Collabora, Ltd.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+/* Default test file size: 4MB */
+#define MB (1UL << 20)
+#define FILE_SIZE (4 * MB)
+
+
+/*
+ * Tests the user interface. This test triggers most of the documented
+ * error conditions in mincore().
+ */
+TEST(basic_interface)
+{
+ int retval;
+ int page_size;
+ unsigned char vec[1];
+ char *addr;
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ /* Query a 0 byte sized range */
+ retval = mincore(0, 0, vec);
+ EXPECT_EQ(0, retval);
+
+ /* Addresses in the specified range are invalid or unmapped */
+ errno = 0;
+ retval = mincore(NULL, page_size, vec);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(ENOMEM, errno);
+
+ errno = 0;
+ addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+
+ /* <addr> argument is not page-aligned */
+ errno = 0;
+ retval = mincore(addr + 1, page_size, vec);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(EINVAL, errno);
+
+ /* <length> argument is too large */
+ errno = 0;
+ retval = mincore(addr, -1, vec);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(ENOMEM, errno);
+
+ /* <vec> argument points to an illegal address */
+ errno = 0;
+ retval = mincore(addr, page_size, NULL);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(EFAULT, errno);
+ munmap(addr, page_size);
+}
+
+
+/*
+ * Test mincore() behavior on a private anonymous page mapping.
+ * Check that the page is not loaded into memory right after the mapping
+ * but after accessing it (on-demand allocation).
+ * Then free the page and check that it's not memory-resident.
+ */
+TEST(check_anonymous_locked_pages)
+{
+ unsigned char vec[1];
+ char *addr;
+ int retval;
+ int page_size;
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ /* Map one page and check it's not memory-resident */
+ errno = 0;
+ addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(0, vec[0]) {
+ TH_LOG("Page found in memory before use");
+ }
+
+ /* Touch the page and check again. It should now be in memory */
+ addr[0] = 1;
+ mlock(addr, page_size);
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(1, vec[0]) {
+ TH_LOG("Page not found in memory after use");
+ }
+
+ /*
+ * It shouldn't be memory-resident after unlocking it and
+ * marking it as unneeded.
+ */
+ munlock(addr, page_size);
+ madvise(addr, page_size, MADV_DONTNEED);
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(0, vec[0]) {
+ TH_LOG("Page in memory after being zapped");
+ }
+ munmap(addr, page_size);
+}
+
+
+/*
+ * Check mincore() behavior on huge pages.
+ * This test will be skipped if the mapping fails (i.e. if there are no
+ * huge pages available).
+ *
+ * Make sure the system has at least one free huge page; check
+ * "HugePages_Free" in /proc/meminfo.
+ * Increment /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages if
+ * needed.
+ */
+TEST(check_huge_pages)
+{
+ unsigned char vec[1];
+ char *addr;
+ int retval;
+ int page_size;
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ errno = 0;
+ addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (addr == MAP_FAILED) {
+ if (errno == ENOMEM)
+ SKIP(return, "No huge pages available.");
+ else
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(0, vec[0]) {
+ TH_LOG("Page found in memory before use");
+ }
+
+ addr[0] = 1;
+ mlock(addr, page_size);
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(1, vec[0]) {
+ TH_LOG("Page not found in memory after use");
+ }
+
+ munlock(addr, page_size);
+ munmap(addr, page_size);
+}
+
+
+/*
+ * Test mincore() behavior on a file-backed page.
+ * No pages should be loaded into memory right after the mapping. Then,
+ * accessing any address in the mapping range should load the page
+ * containing the address and a number of subsequent pages (readahead).
+ *
+ * The actual readahead settings depend on the test environment, so we
+ * can't make a lot of assumptions about that. This test covers the most
+ * general cases.
+ */
+TEST(check_file_mmap)
+{
+ unsigned char *vec;
+ int vec_size;
+ char *addr;
+ int retval;
+ int page_size;
+ int fd;
+ int i;
+ int ra_pages = 0;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ vec_size = FILE_SIZE / page_size;
+ if (FILE_SIZE % page_size)
+ vec_size++;
+
+ vec = calloc(vec_size, sizeof(unsigned char));
+ ASSERT_NE(NULL, vec) {
+ TH_LOG("Can't allocate array");
+ }
+
+ errno = 0;
+ fd = open(".", O_TMPFILE | O_RDWR, 0600);
+ ASSERT_NE(-1, fd) {
+ TH_LOG("Can't create temporary file: %s",
+ strerror(errno));
+ }
+ errno = 0;
+ retval = fallocate(fd, 0, 0, FILE_SIZE);
+ ASSERT_EQ(0, retval) {
+ TH_LOG("Error allocating space for the temporary file: %s",
+ strerror(errno));
+ }
+
+ /*
+ * Map the whole file, the pages shouldn't be fetched yet.
+ */
+ errno = 0;
+ addr = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ for (i = 0; i < vec_size; i++) {
+ ASSERT_EQ(0, vec[i]) {
+ TH_LOG("Unexpected page in memory");
+ }
+ }
+
+ /*
+ * Touch a page in the middle of the mapping. We expect the next
+ * few pages (the readahead window) to be populated too.
+ */
+ addr[FILE_SIZE / 2] = 1;
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(1, vec[FILE_SIZE / 2 / page_size]) {
+ TH_LOG("Page not found in memory after use");
+ }
+
+ i = FILE_SIZE / 2 / page_size + 1;
+ while (i < vec_size && vec[i]) {
+ ra_pages++;
+ i++;
+ }
+ EXPECT_GT(ra_pages, 0) {
+ TH_LOG("No read-ahead pages found in memory");
+ }
+
+ EXPECT_LT(i, vec_size) {
+ TH_LOG("Read-ahead pages reached the end of the file");
+ }
+ /*
+ * End of the readahead window. The rest of the pages shouldn't
+ * be in memory.
+ */
+ if (i < vec_size) {
+ while (i < vec_size && !vec[i])
+ i++;
+ EXPECT_EQ(vec_size, i) {
+ TH_LOG("Unexpected page in memory beyond readahead window");
+ }
+ }
+
+ munmap(addr, FILE_SIZE);
+ close(fd);
+ free(vec);
+}
+
+
+/*
+ * Test mincore() behavior on a page backed by a tmpfs file. This test
+ * performs the same steps as the previous one. However, we don't expect
+ * any readahead in this case.
+ */
+TEST(check_tmpfs_mmap)
+{
+ unsigned char *vec;
+ int vec_size;
+ char *addr;
+ int retval;
+ int page_size;
+ int fd;
+ int i;
+ int ra_pages = 0;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ vec_size = FILE_SIZE / page_size;
+ if (FILE_SIZE % page_size)
+ vec_size++;
+
+ vec = calloc(vec_size, sizeof(unsigned char));
+ ASSERT_NE(NULL, vec) {
+ TH_LOG("Can't allocate array");
+ }
+
+ errno = 0;
+ fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);
+ ASSERT_NE(-1, fd) {
+ TH_LOG("Can't create temporary file: %s",
+ strerror(errno));
+ }
+ errno = 0;
+ retval = fallocate(fd, 0, 0, FILE_SIZE);
+ ASSERT_EQ(0, retval) {
+ TH_LOG("Error allocating space for the temporary file: %s",
+ strerror(errno));
+ }
+
+ /*
+ * Map the whole file, the pages shouldn't be fetched yet.
+ */
+ errno = 0;
+ addr = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ for (i = 0; i < vec_size; i++) {
+ ASSERT_EQ(0, vec[i]) {
+ TH_LOG("Unexpected page in memory");
+ }
+ }
+
+ /*
+ * Touch a page in the middle of the mapping. We expect only
+ * that page to be fetched into memory.
+ */
+ addr[FILE_SIZE / 2] = 1;
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(1, vec[FILE_SIZE / 2 / page_size]) {
+ TH_LOG("Page not found in memory after use");
+ }
+
+ i = FILE_SIZE / 2 / page_size + 1;
+ while (i < vec_size && vec[i]) {
+ ra_pages++;
+ i++;
+ }
+ ASSERT_EQ(ra_pages, 0) {
+ TH_LOG("Read-ahead pages found in memory");
+ }
+
+ munmap(addr, FILE_SIZE);
+ close(fd);
+ free(vec);
+}
+
+TEST_HARNESS_MAIN
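
Both new selftests plug into the standard kselftest harness, so they should be runnable through the usual kselftest targets; something along the lines of "make -C tools/testing/selftests TARGETS=mincore run_tests" (and TARGETS=cgroup for test_kmem) is the expected invocation, with the details described in Documentation/dev-tools/kselftest.rst.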