diff options
250 files changed, 2276 insertions, 4084 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b74e13312fdc..00bb04972612 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1864,13 +1864,6 @@ Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, the default is off. - kmemcheck= [X86] Boot-time kmemcheck enable/disable/one-shot mode - Valid arguments: 0, 1, 2 - kmemcheck=0 (disabled) - kmemcheck=1 (enabled) - kmemcheck=2 (one-shot mode) - Default: 2 (one-shot mode) - kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index a81787cd47d7..e313925fb0fa 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -21,7 +21,6 @@ whole; patches welcome! kasan ubsan kmemleak - kmemcheck gdb-kernel-debugging kgdb kselftest diff --git a/Documentation/dev-tools/kmemcheck.rst b/Documentation/dev-tools/kmemcheck.rst deleted file mode 100644 index 7f3d1985de74..000000000000 --- a/Documentation/dev-tools/kmemcheck.rst +++ /dev/null @@ -1,733 +0,0 @@ -Getting started with kmemcheck -============================== - -Vegard Nossum <vegardno@ifi.uio.no> - - -Introduction ------------- - -kmemcheck is a debugging feature for the Linux Kernel. More specifically, it -is a dynamic checker that detects and warns about some uses of uninitialized -memory. - -Userspace programmers might be familiar with Valgrind's memcheck. The main -difference between memcheck and kmemcheck is that memcheck works for userspace -programs only, and kmemcheck works for the kernel only. The implementations -are of course vastly different. Because of this, kmemcheck is not as accurate -as memcheck, but it turns out to be good enough in practice to discover real -programmer errors that the compiler is not able to find through static -analysis. - -Enabling kmemcheck on a kernel will probably slow it down to the extent that -the machine will not be usable for normal workloads such as e.g. an -interactive desktop. kmemcheck will also cause the kernel to use about twice -as much memory as normal. For this reason, kmemcheck is strictly a debugging -feature. - - -Downloading ------------ - -As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel. - - -Configuring and compiling -------------------------- - -kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of -configuration variables must have specific settings in order for the kmemcheck -menu to even appear in "menuconfig". These are: - -- ``CONFIG_CC_OPTIMIZE_FOR_SIZE=n`` - This option is located under "General setup" / "Optimize for size". - - Without this, gcc will use certain optimizations that usually lead to - false positive warnings from kmemcheck. An example of this is a 16-bit - field in a struct, where gcc may load 32 bits, then discard the upper - 16 bits. kmemcheck sees only the 32-bit load, and may trigger a - warning for the upper 16 bits (if they're uninitialized). - -- ``CONFIG_SLAB=y`` or ``CONFIG_SLUB=y`` - This option is located under "General setup" / "Choose SLAB - allocator". - -- ``CONFIG_FUNCTION_TRACER=n`` - This option is located under "Kernel hacking" / "Tracers" / "Kernel - Function Tracer" - - When function tracing is compiled in, gcc emits a call to another - function at the beginning of every function. This means that when the - page fault handler is called, the ftrace framework will be called - before kmemcheck has had a chance to handle the fault. If ftrace then - modifies memory that was tracked by kmemcheck, the result is an - endless recursive page fault. - -- ``CONFIG_DEBUG_PAGEALLOC=n`` - This option is located under "Kernel hacking" / "Memory Debugging" - / "Debug page memory allocations". - -In addition, I highly recommend turning on ``CONFIG_DEBUG_INFO=y``. This is also -located under "Kernel hacking". With this, you will be able to get line number -information from the kmemcheck warnings, which is extremely valuable in -debugging a problem. This option is not mandatory, however, because it slows -down the compilation process and produces a much bigger kernel image. - -Now the kmemcheck menu should be visible (under "Kernel hacking" / "Memory -Debugging" / "kmemcheck: trap use of uninitialized memory"). Here follows -a description of the kmemcheck configuration variables: - -- ``CONFIG_KMEMCHECK`` - This must be enabled in order to use kmemcheck at all... - -- ``CONFIG_KMEMCHECK_``[``DISABLED`` | ``ENABLED`` | ``ONESHOT``]``_BY_DEFAULT`` - This option controls the status of kmemcheck at boot-time. "Enabled" - will enable kmemcheck right from the start, "disabled" will boot the - kernel as normal (but with the kmemcheck code compiled in, so it can - be enabled at run-time after the kernel has booted), and "one-shot" is - a special mode which will turn kmemcheck off automatically after - detecting the first use of uninitialized memory. - - If you are using kmemcheck to actively debug a problem, then you - probably want to choose "enabled" here. - - The one-shot mode is mostly useful in automated test setups because it - can prevent floods of warnings and increase the chances of the machine - surviving in case something is really wrong. In other cases, the one- - shot mode could actually be counter-productive because it would turn - itself off at the very first error -- in the case of a false positive - too -- and this would come in the way of debugging the specific - problem you were interested in. - - If you would like to use your kernel as normal, but with a chance to - enable kmemcheck in case of some problem, it might be a good idea to - choose "disabled" here. When kmemcheck is disabled, most of the run- - time overhead is not incurred, and the kernel will be almost as fast - as normal. - -- ``CONFIG_KMEMCHECK_QUEUE_SIZE`` - Select the maximum number of error reports to store in an internal - (fixed-size) buffer. Since errors can occur virtually anywhere and in - any context, we need a temporary storage area which is guaranteed not - to generate any other page faults when accessed. The queue will be - emptied as soon as a tasklet may be scheduled. If the queue is full, - new error reports will be lost. - - The default value of 64 is probably fine. If some code produces more - than 64 errors within an irqs-off section, then the code is likely to - produce many, many more, too, and these additional reports seldom give - any more information (the first report is usually the most valuable - anyway). - - This number might have to be adjusted if you are not using serial - console or similar to capture the kernel log. If you are using the - "dmesg" command to save the log, then getting a lot of kmemcheck - warnings might overflow the kernel log itself, and the earlier reports - will get lost in that way instead. Try setting this to 10 or so on - such a setup. - -- ``CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT`` - Select the number of shadow bytes to save along with each entry of the - error-report queue. These bytes indicate what parts of an allocation - are initialized, uninitialized, etc. and will be displayed when an - error is detected to help the debugging of a particular problem. - - The number entered here is actually the logarithm of the number of - bytes that will be saved. So if you pick for example 5 here, kmemcheck - will save 2^5 = 32 bytes. - - The default value should be fine for debugging most problems. It also - fits nicely within 80 columns. - -- ``CONFIG_KMEMCHECK_PARTIAL_OK`` - This option (when enabled) works around certain GCC optimizations that - produce 32-bit reads from 16-bit variables where the upper 16 bits are - thrown away afterwards. - - The default value (enabled) is recommended. This may of course hide - some real errors, but disabling it would probably produce a lot of - false positives. - -- ``CONFIG_KMEMCHECK_BITOPS_OK`` - This option silences warnings that would be generated for bit-field - accesses where not all the bits are initialized at the same time. This - may also hide some real bugs. - - This option is probably obsolete, or it should be replaced with - the kmemcheck-/bitfield-annotations for the code in question. The - default value is therefore fine. - -Now compile the kernel as usual. - - -How to use ----------- - -Booting -~~~~~~~ - -First some information about the command-line options. There is only one -option specific to kmemcheck, and this is called "kmemcheck". It can be used -to override the default mode as chosen by the ``CONFIG_KMEMCHECK_*_BY_DEFAULT`` -option. Its possible settings are: - -- ``kmemcheck=0`` (disabled) -- ``kmemcheck=1`` (enabled) -- ``kmemcheck=2`` (one-shot mode) - -If SLUB debugging has been enabled in the kernel, it may take precedence over -kmemcheck in such a way that the slab caches which are under SLUB debugging -will not be tracked by kmemcheck. In order to ensure that this doesn't happen -(even though it shouldn't by default), use SLUB's boot option ``slub_debug``, -like this: ``slub_debug=-`` - -In fact, this option may also be used for fine-grained control over SLUB vs. -kmemcheck. For example, if the command line includes -``kmemcheck=1 slub_debug=,dentry``, then SLUB debugging will be used only -for the "dentry" slab cache, and with kmemcheck tracking all the other -caches. This is advanced usage, however, and is not generally recommended. - - -Run-time enable/disable -~~~~~~~~~~~~~~~~~~~~~~~ - -When the kernel has booted, it is possible to enable or disable kmemcheck at -run-time. WARNING: This feature is still experimental and may cause false -positive warnings to appear. Therefore, try not to use this. If you find that -it doesn't work properly (e.g. you see an unreasonable amount of warnings), I -will be happy to take bug reports. - -Use the file ``/proc/sys/kernel/kmemcheck`` for this purpose, e.g.:: - - $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck - -The numbers are the same as for the ``kmemcheck=`` command-line option. - - -Debugging -~~~~~~~~~ - -A typical report will look something like this:: - - WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - ^ - - Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A - RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190 - RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002 - RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 - RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84 - RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000 - R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e - R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8 - FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000 - CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 - CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0 - DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 - DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400 - [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170 - [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390 - [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0 - [<ffffffff8100c7b5>] int_signal+0x12/0x17 - [<ffffffffffffffff>] 0xffffffffffffffff - -The single most valuable information in this report is the RIP (or EIP on 32- -bit) value. This will help us pinpoint exactly which instruction that caused -the warning. - -If your kernel was compiled with ``CONFIG_DEBUG_INFO=y``, then all we have to do -is give this address to the addr2line program, like this:: - - $ addr2line -e vmlinux -i ffffffff8104ede8 - arch/x86/include/asm/string_64.h:12 - include/asm-generic/siginfo.h:287 - kernel/signal.c:380 - kernel/signal.c:410 - -The "``-e vmlinux``" tells addr2line which file to look in. **IMPORTANT:** -This must be the vmlinux of the kernel that produced the warning in the -first place! If not, the line number information will almost certainly be -wrong. - -The "``-i``" tells addr2line to also print the line numbers of inlined -functions. In this case, the flag was very important, because otherwise, -it would only have printed the first line, which is just a call to -``memcpy()``, which could be called from a thousand places in the kernel, and -is therefore not very useful. These inlined functions would not show up in -the stack trace above, simply because the kernel doesn't load the extra -debugging information. This technique can of course be used with ordinary -kernel oopses as well. - -In this case, it's the caller of ``memcpy()`` that is interesting, and it can be -found in ``include/asm-generic/siginfo.h``, line 287:: - - 281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) - 282 { - 283 if (from->si_code < 0) - 284 memcpy(to, from, sizeof(*to)); - 285 else - 286 /* _sigchld is currently the largest know union member */ - 287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); - 288 } - -Since this was a read (kmemcheck usually warns about reads only, though it can -warn about writes to unallocated or freed memory as well), it was probably the -"from" argument which contained some uninitialized bytes. Following the chain -of calls, we move upwards to see where "from" was allocated or initialized, -``kernel/signal.c``, line 380:: - - 359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) - 360 { - ... - 367 list_for_each_entry(q, &list->list, list) { - 368 if (q->info.si_signo == sig) { - 369 if (first) - 370 goto still_pending; - 371 first = q; - ... - 377 if (first) { - 378 still_pending: - 379 list_del_init(&first->list); - 380 copy_siginfo(info, &first->info); - 381 __sigqueue_free(first); - ... - 392 } - 393 } - -Here, it is ``&first->info`` that is being passed on to ``copy_siginfo()``. The -variable ``first`` was found on a list -- passed in as the second argument to -``collect_signal()``. We continue our journey through the stack, to figure out -where the item on "list" was allocated or initialized. We move to line 410:: - - 395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - 396 siginfo_t *info) - 397 { - ... - 410 collect_signal(sig, pending, info); - ... - 414 } - -Now we need to follow the ``pending`` pointer, since that is being passed on to -``collect_signal()`` as ``list``. At this point, we've run out of lines from the -"addr2line" output. Not to worry, we just paste the next addresses from the -kmemcheck stack dump, i.e.:: - - [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170 - [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390 - [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0 - [<ffffffff8100c7b5>] int_signal+0x12/0x17 - - $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \ - ffffffff8100b87d ffffffff8100c7b5 - kernel/signal.c:446 - kernel/signal.c:1806 - arch/x86/kernel/signal.c:805 - arch/x86/kernel/signal.c:871 - arch/x86/kernel/entry_64.S:694 - -Remember that since these addresses were found on the stack and not as the -RIP value, they actually point to the _next_ instruction (they are return -addresses). This becomes obvious when we look at the code for line 446:: - - 422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) - 423 { - ... - 431 signr = __dequeue_signal(&tsk->signal->shared_pending, - 432 mask, info); - 433 /* - 434 * itimer signal ? - 435 * - 436 * itimers are process shared and we restart periodic - 437 * itimers in the signal delivery path to prevent DoS - 438 * attacks in the high resolution timer case. This is - 439 * compliant with the old way of self restarting - 440 * itimers, as the SIGALRM is a legacy signal and only - 441 * queued once. Changing the restart behaviour to - 442 * restart the timer in the signal dequeue path is - 443 * reducing the timer noise on heavy loaded !highres - 444 * systems too. - 445 */ - 446 if (unlikely(signr == SIGALRM)) { - ... - 489 } - -So instead of looking at 446, we should be looking at 431, which is the line -that executes just before 446. Here we see that what we are looking for is -``&tsk->signal->shared_pending``. - -Our next task is now to figure out which function that puts items on this -``shared_pending`` list. A crude, but efficient tool, is ``git grep``:: - - $ git grep -n 'shared_pending' kernel/ - ... - kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending; - kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending; - ... - -There were more results, but none of them were related to list operations, -and these were the only assignments. We inspect the line numbers more closely -and find that this is indeed where items are being added to the list:: - - 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - 817 int group) - 818 { - ... - 828 pending = group ? &t->signal->shared_pending : &t->pending; - ... - 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - 852 (is_si_special(info) || - 853 info->si_code >= 0))); - 854 if (q) { - 855 list_add_tail(&q->list, &pending->list); - ... - 890 } - -and:: - - 1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) - 1310 { - .... - 1339 pending = group ? &t->signal->shared_pending : &t->pending; - 1340 list_add_tail(&q->list, &pending->list); - .... - 1347 } - -In the first case, the list element we are looking for, ``q``, is being -returned from the function ``__sigqueue_alloc()``, which looks like an -allocation function. Let's take a look at it:: - - 187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - 188 int override_rlimit) - 189 { - 190 struct sigqueue *q = NULL; - 191 struct user_struct *user; - 192 - 193 /* - 194 * We won't get problems with the target's UID changing under us - 195 * because changing it requires RCU be used, and if t != current, the - 196 * caller must be holding the RCU readlock (by way of a spinlock) and - 197 * we use RCU protection here - 198 */ - 199 user = get_uid(__task_cred(t)->user); - 200 atomic_inc(&user->sigpending); - 201 if (override_rlimit || - 202 atomic_read(&user->sigpending) <= - 203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) - 204 q = kmem_cache_alloc(sigqueue_cachep, flags); - 205 if (unlikely(q == NULL)) { - 206 atomic_dec(&user->sigpending); - 207 free_uid(user); - 208 } else { - 209 INIT_LIST_HEAD(&q->list); - 210 q->flags = 0; - 211 q->user = user; - 212 } - 213 - 214 return q; - 215 } - -We see that this function initializes ``q->list``, ``q->flags``, and -``q->user``. It seems that now is the time to look at the definition of -``struct sigqueue``, e.g.:: - - 14 struct sigqueue { - 15 struct list_head list; - 16 int flags; - 17 siginfo_t info; - 18 struct user_struct *user; - 19 }; - -And, you might remember, it was a ``memcpy()`` on ``&first->info`` that -caused the warning, so this makes perfect sense. It also seems reasonable -to assume that it is the caller of ``__sigqueue_alloc()`` that has the -responsibility of filling out (initializing) this member. - -But just which fields of the struct were uninitialized? Let's look at -kmemcheck's report again:: - - WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - ^ - -These first two lines are the memory dump of the memory object itself, and -the shadow bytemap, respectively. The memory object itself is in this case -``&first->info``. Just beware that the start of this dump is NOT the start -of the object itself! The position of the caret (^) corresponds with the -address of the read (ffff88003e4a2024). - -The shadow bytemap dump legend is as follows: - -- i: initialized -- u: uninitialized -- a: unallocated (memory has been allocated by the slab layer, but has not - yet been handed off to anybody) -- f: freed (memory has been allocated by the slab layer, but has been freed - by the previous owner) - -In order to figure out where (relative to the start of the object) the -uninitialized memory was located, we have to look at the disassembly. For -that, we'll need the RIP address again:: - - RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190 - - $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8: - ffffffff8104edc8: mov %r8,0x8(%r8) - ffffffff8104edcc: test %r10d,%r10d - ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168> - ffffffff8104edd5: mov %rax,%rdx - ffffffff8104edd8: mov $0xc,%ecx - ffffffff8104eddd: mov %r13,%rdi - ffffffff8104ede0: mov $0x30,%eax - ffffffff8104ede5: mov %rdx,%rsi - ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi) - ffffffff8104edea: test $0x2,%al - ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0> - ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi) - ffffffff8104edf0: test $0x1,%al - ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5> - ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi) - ffffffff8104edf5: mov %r8,%rdi - ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free> - -As expected, it's the "``rep movsl``" instruction from the ``memcpy()`` -that causes the warning. We know about ``REP MOVSL`` that it uses the register -``RCX`` to count the number of remaining iterations. By taking a look at the -register dump again (from the kmemcheck report), we can figure out how many -bytes were left to copy:: - - RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 - -By looking at the disassembly, we also see that ``%ecx`` is being loaded -with the value ``$0xc`` just before (ffffffff8104edd8), so we are very -lucky. Keep in mind that this is the number of iterations, not bytes. And -since this is a "long" operation, we need to multiply by 4 to get the -number of bytes. So this means that the uninitialized value was encountered -at 4 * (0xc - 0x9) = 12 bytes from the start of the object. - -We can now try to figure out which field of the "``struct siginfo``" that -was not initialized. This is the beginning of the struct:: - - 40 typedef struct siginfo { - 41 int si_signo; - 42 int si_errno; - 43 int si_code; - 44 - 45 union { - .. - 92 } _sifields; - 93 } siginfo_t; - -On 64-bit, the int is 4 bytes long, so it must the union member that has -not been initialized. We can verify this using gdb:: - - $ gdb vmlinux - ... - (gdb) p &((struct siginfo *) 0)->_sifields - $1 = (union {...} *) 0x10 - -Actually, it seems that the union member is located at offset 0x10 -- which -means that gcc has inserted 4 bytes of padding between the members ``si_code`` -and ``_sifields``. We can now get a fuller picture of the memory dump:: - - _----------------------------=> si_code - / _--------------------=> (padding) - | / _------------=> _sifields(._kill._pid) - | | / _----=> _sifields(._kill._uid) - | | | / - -------|-------|-------|-------| - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - -This allows us to realize another important fact: ``si_code`` contains the -value 0x80. Remember that x86 is little endian, so the first 4 bytes -"80000000" are really the number 0x00000080. With a bit of research, we -find that this is actually the constant ``SI_KERNEL`` defined in -``include/asm-generic/siginfo.h``:: - - 144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ - -This macro is used in exactly one place in the x86 kernel: In ``send_signal()`` -in ``kernel/signal.c``:: - - 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - 817 int group) - 818 { - ... - 828 pending = group ? &t->signal->shared_pending : &t->pending; - ... - 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - 852 (is_si_special(info) || - 853 info->si_code >= 0))); - 854 if (q) { - 855 list_add_tail(&q->list, &pending->list); - 856 switch ((unsigned long) info) { - ... - 865 case (unsigned long) SEND_SIG_PRIV: - 866 q->info.si_signo = sig; - 867 q->info.si_errno = 0; - 868 q->info.si_code = SI_KERNEL; - 869 q->info.si_pid = 0; - 870 q->info.si_uid = 0; - 871 break; - ... - 890 } - -Not only does this match with the ``.si_code`` member, it also matches the place -we found earlier when looking for where siginfo_t objects are enqueued on the -``shared_pending`` list. - -So to sum up: It seems that it is the padding introduced by the compiler -between two struct fields that is uninitialized, and this gets reported when -we do a ``memcpy()`` on the struct. This means that we have identified a false -positive warning. - -Normally, kmemcheck will not report uninitialized accesses in ``memcpy()`` calls -when both the source and destination addresses are tracked. (Instead, we copy -the shadow bytemap as well). In this case, the destination address clearly -was not tracked. We can dig a little deeper into the stack trace from above:: - - arch/x86/kernel/signal.c:805 - arch/x86/kernel/signal.c:871 - arch/x86/kernel/entry_64.S:694 - -And we clearly see that the destination siginfo object is located on the -stack:: - - 782 static void do_signal(struct pt_regs *regs) - 783 { - 784 struct k_sigaction ka; - 785 siginfo_t info; - ... - 804 signr = get_signal_to_deliver(&info, &ka, regs, NULL); - ... - 854 } - -And this ``&info`` is what eventually gets passed to ``copy_siginfo()`` as the -destination argument. - -Now, even though we didn't find an actual error here, the example is still a -good one, because it shows how one would go about to find out what the report -was all about. - - -Annotating false positives -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are a few different ways to make annotations in the source code that -will keep kmemcheck from checking and reporting certain allocations. Here -they are: - -- ``__GFP_NOTRACK_FALSE_POSITIVE`` - This flag can be passed to ``kmalloc()`` or ``kmem_cache_alloc()`` - (therefore also to other functions that end up calling one of - these) to indicate that the allocation should not be tracked - because it would lead to a false positive report. This is a "big - hammer" way of silencing kmemcheck; after all, even if the false - positive pertains to particular field in a struct, for example, we - will now lose the ability to find (real) errors in other parts of - the same struct. - - Example:: - - /* No warnings will ever trigger on accessing any part of x */ - x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE); - -- ``kmemcheck_bitfield_begin(name)``/``kmemcheck_bitfield_end(name)`` and - ``kmemcheck_annotate_bitfield(ptr, name)`` - The first two of these three macros can be used inside struct - definitions to signal, respectively, the beginning and end of a - bitfield. Additionally, this will assign the bitfield a name, which - is given as an argument to the macros. - - Having used these markers, one can later use - kmemcheck_annotate_bitfield() at the point of allocation, to indicate - which parts of the allocation is part of a bitfield. - - Example:: - - struct foo { - int x; - - kmemcheck_bitfield_begin(flags); - int flag_a:1; - int flag_b:1; - kmemcheck_bitfield_end(flags); - - int y; - }; - - struct foo *x = kmalloc(sizeof *x); - - /* No warnings will trigger on accessing the bitfield of x */ - kmemcheck_annotate_bitfield(x, flags); - - Note that ``kmemcheck_annotate_bitfield()`` can be used even before the - return value of ``kmalloc()`` is checked -- in other words, passing NULL - as the first argument is legal (and will do nothing). - - -Reporting errors ----------------- - -As we have seen, kmemcheck will produce false positive reports. Therefore, it -is not very wise to blindly post kmemcheck warnings to mailing lists and -maintainers. Instead, I encourage maintainers and developers to find errors -in their own code. If you get a warning, you can try to work around it, try -to figure out if it's a real error or not, or simply ignore it. Most -developers know their own code and will quickly and efficiently determine the -root cause of a kmemcheck report. This is therefore also the most efficient -way to work with kmemcheck. - -That said, we (the kmemcheck maintainers) will always be on the lookout for -false positives that we can annotate and silence. So whatever you find, -please drop us a note privately! Kernel configs and steps to reproduce (if -available) are of course a great help too. - -Happy hacking! - - -Technical description ---------------------- - -kmemcheck works by marking memory pages non-present. This means that whenever -somebody attempts to access the page, a page fault is generated. The page -fault handler notices that the page was in fact only hidden, and so it calls -on the kmemcheck code to make further investigations. - -When the investigations are completed, kmemcheck "shows" the page by marking -it present (as it would be under normal circumstances). This way, the -interrupted code can continue as usual. - -But after the instruction has been executed, we should hide the page again, so -that we can catch the next access too! Now kmemcheck makes use of a debugging -feature of the processor, namely single-stepping. When the processor has -finished the one instruction that generated the memory access, a debug -exception is raised. From here, we simply hide the page again and continue -execution, this time with the single-stepping feature turned off. - -kmemcheck requires some assistance from the memory allocator in order to work. -The memory allocator needs to - - 1. Tell kmemcheck about newly allocated pages and pages that are about to - be freed. This allows kmemcheck to set up and tear down the shadow memory - for the pages in question. The shadow memory stores the status of each - byte in the allocation proper, e.g. whether it is initialized or - uninitialized. - - 2. Tell kmemcheck which parts of memory should be marked uninitialized. - There are actually a few more states, such as "not yet allocated" and - "recently freed". - -If a slab cache is set up using the SLAB_NOTRACK flag, it will never return -memory that can take page faults because of kmemcheck. - -If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still -request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags. -This does not prevent the page faults from occurring, however, but marks the -object in question as being initialized so that no warnings will ever be -produced for this object. - -Currently, the SLAB and SLUB allocators are supported by kmemcheck. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index adba21b5ada7..ec571b9bb18a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries - VmPMD size of second level page tables VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a9ef4e..055c8b3e1018 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - stat_refresh +- numa_stat - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -157,6 +158,10 @@ Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any value lower than this limit will be ignored and the old configuration will be retained. +Note: the value of dirty_bytes also must be set greater than +dirty_background_bytes or the amount of memory corresponding to +dirty_background_ratio. + ============================================================== dirty_expire_centisecs @@ -176,6 +181,9 @@ generating disk writes will itself start writing out dirty data. The total available memory is not equal to total system memory. +Note: dirty_ratio must be set greater than dirty_background_ratio or +ratio corresponding to dirty_background_bytes. + ============================================================== dirty_writeback_centisecs @@ -622,7 +630,7 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj +pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj score, and name. This is helpful to determine why the OOM killer was invoked, to identify the rogue task that caused it, and to determine why the OOM killer chose the task it did to kill. @@ -792,6 +800,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) ============================================================== +numa_stat + +This interface allows runtime configuration of numa statistics. + +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo 1 > /proc/sys/vm/numa_stat + +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt new file mode 100644 index 000000000000..23b462566bb7 --- /dev/null +++ b/Documentation/vm/mmu_notifier.txt @@ -0,0 +1,93 @@ +When do you need to notify inside page table lock ? + +When clearing a pte/pmd we are given a choice to notify the event through +(notify version of *_clear_flush call mmu_notifier_invalidate_range) under +the page table lock. But that notification is not necessary in all cases. + +For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use +thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a +process virtual address space). There is only 2 cases when you need to notify +those secondary TLB while holding page table lock when clearing a pte/pmd: + + A) page backing address is free before mmu_notifier_invalidate_range_end() + B) a page table entry is updated to point to a new page (COW, write fault + on zero page, __replace_page(), ...) + +Case A is obvious you do not want to take the risk for the device to write to +a page that might now be used by some completely different task. + +Case B is more subtle. For correctness it requires the following sequence to +happen: + - take page table lock + - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) + - set page table entry to point to new page + +If clearing the page table entry is not followed by a notify before setting +the new pte/pmd value then you can break memory model like C11 or C++11 for +the device. + +Consider the following scenario (device use a feature similar to ATS/PASID): + +Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume +they are write protected for COW (other case of B apply too). + +[Time N] -------------------------------------------------------------------- +CPU-thread-0 {try to write to addrA} +CPU-thread-1 {try to write to addrB} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {read addrA and populate device TLB} +DEV-thread-2 {read addrB and populate device TLB} +[Time N+1] ------------------------------------------------------------------ +CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} +CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+2] ------------------------------------------------------------------ +CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} +CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+3] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {preempted} +CPU-thread-2 {write to addrA which is a write to new page} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+3] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {preempted} +CPU-thread-2 {} +CPU-thread-3 {write to addrB which is a write to new page} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+4] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+5] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {read addrA from old page} +DEV-thread-2 {read addrB from new page} + +So here because at time N+2 the clear page table entry was not pair with a +notification to invalidate the secondary TLB, the device see the new value for +addrB before seing the new value for addrA. This break total memory ordering +for the device. + +When changing a pte to write protect or to point to a new write protected page +with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range +call to mmu_notifier_invalidate_range_end() outside the page table lock. This +is true even if the thread doing the page table update is preempted right after +releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/MAINTAINERS b/MAINTAINERS index cd7e12dc6af4..b0543c223f6a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7692,16 +7692,6 @@ F: include/linux/kdb.h F: include/linux/kgdb.h F: kernel/debug/ -KMEMCHECK -M: Vegard Nossum <vegardno@ifi.uio.no> -M: Pekka Enberg <penberg@kernel.org> -S: Maintained -F: Documentation/dev-tools/kmemcheck.rst -F: arch/x86/include/asm/kmemcheck.h -F: arch/x86/mm/kmemcheck/ -F: include/linux/kmemcheck.h -F: mm/kmemcheck.c - KMEMLEAK M: Catalin Marinas <catalin.marinas@arm.com> S: Maintained diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h index 0722ec6be692..6821f1249300 100644 --- a/arch/arm/include/asm/dma-iommu.h +++ b/arch/arm/include/asm/dma-iommu.h @@ -7,7 +7,6 @@ #include <linux/mm_types.h> #include <linux/scatterlist.h> #include <linux/dma-debug.h> -#include <linux/kmemcheck.h> #include <linux/kref.h> #define ARM_MAPPING_ERROR (~(dma_addr_t)0x0) diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index b2902a5cd780..2d7344f0e208 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) static inline void clean_pte_table(pte_t *pte) { diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index c1c1a5c67da1..61e281cb29fb 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); no_pmd: pud_clear(pud); pmd_free(mm, pmd); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ba6aab55d464..a93339f5178f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -85,7 +85,7 @@ config ARM64 select HAVE_ARCH_BITREVERSE select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) + select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index d25f4f137c2a..5ca6a573a701 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -26,7 +26,7 @@ #define check_pgt_cache() do { } while (0) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 81f03959a4ab..acba49fb5aac 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -11,6 +11,7 @@ */ #define pr_fmt(fmt) "kasan: " fmt +#include <linux/bootmem.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/sched/task.h> @@ -35,77 +36,117 @@ static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); * with the physical address from __pa_symbol. */ -static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, - unsigned long end) +static phys_addr_t __init kasan_alloc_zeroed_page(int node) { - pte_t *pte; - unsigned long next; + void *p = memblock_virt_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, node); + return __pa(p); +} + +static pte_t *__init kasan_pte_offset(pmd_t *pmd, unsigned long addr, int node, + bool early) +{ + if (pmd_none(*pmd)) { + phys_addr_t pte_phys = early ? __pa_symbol(kasan_zero_pte) + : kasan_alloc_zeroed_page(node); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); + } + + return early ? pte_offset_kimg(pmd, addr) + : pte_offset_kernel(pmd, addr); +} - if (pmd_none(*pmd)) - __pmd_populate(pmd, __pa_symbol(kasan_zero_pte), PMD_TYPE_TABLE); +static pmd_t *__init kasan_pmd_offset(pud_t *pud, unsigned long addr, int node, + bool early) +{ + if (pud_none(*pud)) { + phys_addr_t pmd_phys = early ? __pa_symbol(kasan_zero_pmd) + : kasan_alloc_zeroed_page(node); + __pud_populate(pud, pmd_phys, PMD_TYPE_TABLE); + } + + return early ? pmd_offset_kimg(pud, addr) : pmd_offset(pud, addr); +} + +static pud_t *__init kasan_pud_offset(pgd_t *pgd, unsigned long addr, int node, + bool early) +{ + if (pgd_none(*pgd)) { + phys_addr_t pud_phys = early ? __pa_symbol(kasan_zero_pud) + : kasan_alloc_zeroed_page(node); + __pgd_populate(pgd, pud_phys, PMD_TYPE_TABLE); + } + + return early ? pud_offset_kimg(pgd, addr) : pud_offset(pgd, addr); +} + +static void __init kasan_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end, int node, bool early) +{ + unsigned long next; + pte_t *pte = kasan_pte_offset(pmd, addr, node, early); - pte = pte_offset_kimg(pmd, addr); do { + phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page) + : kasan_alloc_zeroed_page(node); next = addr + PAGE_SIZE; - set_pte(pte, pfn_pte(sym_to_pfn(kasan_zero_page), - PAGE_KERNEL)); + set_pte(pte, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); } while (pte++, addr = next, addr != end && pte_none(*pte)); } -static void __init kasan_early_pmd_populate(pud_t *pud, - unsigned long addr, - unsigned long end) +static void __init kasan_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end, int node, bool early) { - pmd_t *pmd; unsigned long next; + pmd_t *pmd = kasan_pmd_offset(pud, addr, node, early); - if (pud_none(*pud)) - __pud_populate(pud, __pa_symbol(kasan_zero_pmd), PMD_TYPE_TABLE); - - pmd = pmd_offset_kimg(pud, addr); do { next = pmd_addr_end(addr, end); - kasan_early_pte_populate(pmd, addr, next); + kasan_pte_populate(pmd, addr, next, node, early); } while (pmd++, addr = next, addr != end && pmd_none(*pmd)); } -static void __init kasan_early_pud_populate(pgd_t *pgd, - unsigned long addr, - unsigned long end) +static void __init kasan_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end, int node, bool early) { - pud_t *pud; unsigned long next; + pud_t *pud = kasan_pud_offset(pgd, addr, node, early); - if (pgd_none(*pgd)) - __pgd_populate(pgd, __pa_symbol(kasan_zero_pud), PUD_TYPE_TABLE); - - pud = pud_offset_kimg(pgd, addr); do { next = pud_addr_end(addr, end); - kasan_early_pmd_populate(pud, addr, next); + kasan_pmd_populate(pud, addr, next, node, early); } while (pud++, addr = next, addr != end && pud_none(*pud)); } -static void __init kasan_map_early_shadow(void) +static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, + int node, bool early) { - unsigned long addr = KASAN_SHADOW_START; - unsigned long end = KASAN_SHADOW_END; unsigned long next; pgd_t *pgd; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - kasan_early_pud_populate(pgd, addr, next); + kasan_pud_populate(pgd, addr, next, node, early); } while (pgd++, addr = next, addr != end); } +/* The early shadow maps everything to a single page of zeroes */ asmlinkage void __init kasan_early_init(void) { BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); - kasan_map_early_shadow(); + kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, + true); +} + +/* Set up full kasan mappings, ensuring that the mapped pages are zeroed */ +static void __init kasan_map_populate(unsigned long start, unsigned long end, + int node) +{ + kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false); } /* @@ -142,8 +183,8 @@ void __init kasan_init(void) struct memblock_region *reg; int i; - kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); - kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK; + kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end)); mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); @@ -161,19 +202,8 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - vmemmap_populate(kimg_shadow_start, kimg_shadow_end, - pfn_to_nid(virt_to_pfn(lm_alias(_text)))); - - /* - * vmemmap_populate() has populated the shadow region that covers the - * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round - * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent - * kasan_populate_zero_shadow() from replacing the page table entries - * (PMD or PTE) at the edges of the shadow region for the kernel - * image. - */ - kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE); - kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + kasan_map_populate(kimg_shadow_start, kimg_shadow_end, + pfn_to_nid(virt_to_pfn(lm_alias(_text)))); kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, (void *)mod_shadow_start); @@ -191,9 +221,9 @@ void __init kasan_init(void) if (start >= end) break; - vmemmap_populate((unsigned long)kasan_mem_to_shadow(start), - (unsigned long)kasan_mem_to_shadow(end), - pfn_to_nid(virt_to_pfn(start))); + kasan_map_populate((unsigned long)kasan_mem_to_shadow(start), + (unsigned long)kasan_mem_to_shadow(end), + pfn_to_nid(virt_to_pfn(start))); } /* diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 328f0a292316..cf464100e838 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -42,21 +42,9 @@ #undef DEBUG /* - * BAD_PAGE is the page that is used for page faults when linux - * is out-of-memory. Older versions of linux just did a - * do_exit(), but using this instead means there is less risk - * for a process dying in kernel mode, possibly leaving a inode - * unused etc.. - * - * BAD_PAGETABLE is the accompanying page-table: it is initialized - * to point to BAD_PAGE entries. - * * ZERO_PAGE is a special page that is used for zero-initialized * data and COW. */ -static unsigned long empty_bad_page_table; -static unsigned long empty_bad_page; - unsigned long empty_zero_page; EXPORT_SYMBOL(empty_zero_page); @@ -72,8 +60,6 @@ void __init paging_init(void) unsigned long zones_size[MAX_NR_ZONES] = {0, }; /* allocate some pages for kernel housekeeping tasks */ - empty_bad_page_table = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - empty_bad_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); empty_zero_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); memset((void *) empty_zero_page, 0, PAGE_SIZE); diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index eeead51bed2d..015287ac8ce8 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -40,20 +40,9 @@ #include <asm/sections.h> /* - * BAD_PAGE is the page that is used for page faults when linux - * is out-of-memory. Older versions of linux just did a - * do_exit(), but using this instead means there is less risk - * for a process dying in kernel mode, possibly leaving a inode - * unused etc.. - * - * BAD_PAGETABLE is the accompanying page-table: it is initialized - * to point to BAD_PAGE entries. - * * ZERO_PAGE is a special page that is used for zero-initialized * data and COW. */ -static unsigned long empty_bad_page_table; -static unsigned long empty_bad_page; unsigned long empty_zero_page; /* @@ -78,8 +67,6 @@ void __init paging_init(void) * Initialize the bad page table and bad page to point * to a couple of allocated pages. */ - empty_bad_page_table = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); - empty_bad_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); memset((void *)empty_zero_page, 0, PAGE_SIZE); diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 0d9446c37ae8..498398d915c1 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -196,8 +196,8 @@ config TIMER_DIVIDE default "128" config CPU_BIG_ENDIAN - bool "Generate big endian code" - default n + bool + default !CPU_LITTLE_ENDIAN config CPU_LITTLE_ENDIAN bool "Generate little endian code" diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 67fe6dc5211c..0036ea0c7173 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -31,12 +31,7 @@ * tables. Each page table is also a single 4K page, giving 512 (== * PTRS_PER_PTE) 8 byte ptes. Each pud entry is initialized to point to * invalid_pmd_table, each pmd entry is initialized to point to - * invalid_pte_table, each pte is initialized to 0. When memory is low, - * and a pmd table or a page table allocation fails, empty_bad_pmd_table - * and empty_bad_page_table is returned back to higher layer code, so - * that the failure is recognized later on. Linux does not seem to - * handle these failures very well though. The empty_bad_page_table has - * invalid pte entries in it, to force page faults. + * invalid_pte_table, each pte is initialized to 0. * * Kernel mappings: kernel mappings are held in the swapper_pg_table. * The layout is identical to userspace except it's indexed with the @@ -175,7 +170,6 @@ printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) extern pte_t invalid_pte_table[PTRS_PER_PTE]; -extern pte_t empty_bad_page_table[PTRS_PER_PTE]; #ifndef __PAGETABLE_PUD_FOLDED /* diff --git a/arch/mn10300/kernel/head.S b/arch/mn10300/kernel/head.S index 73e00fc78072..0b15f759e0d2 100644 --- a/arch/mn10300/kernel/head.S +++ b/arch/mn10300/kernel/head.S @@ -434,14 +434,6 @@ ENTRY(empty_zero_page) .space PAGE_SIZE .balign PAGE_SIZE -ENTRY(empty_bad_page) - .space PAGE_SIZE - - .balign PAGE_SIZE -ENTRY(empty_bad_pte_table) - .space PAGE_SIZE - - .balign PAGE_SIZE ENTRY(large_page_table) .space PAGE_SIZE diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h index f41bd3cb76d9..e212a1f0b6d2 100644 --- a/arch/openrisc/include/asm/dma-mapping.h +++ b/arch/openrisc/include/asm/dma-mapping.h @@ -23,7 +23,6 @@ */ #include <linux/dma-debug.h> -#include <linux/kmemcheck.h> #include <linux/dma-mapping.h> extern const struct dma_map_ops or1k_dma_map_ops; diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index a14203c005f1..e11f03007b57 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) } #endif /* MODULE */ -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #ifdef CONFIG_PPC_BOOK3S #include <asm/book3s/pgalloc.h> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1571a498a33f..a9b9083c5e49 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } /* diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 05e15386d4cb..a7e998158f37 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -200,7 +200,7 @@ static void destroy_pagetable_page(struct mm_struct *mm) /* We allow PTE_FRAG_NR fragments from a PTE page */ if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { pgtable_page_dtor(page); - free_hot_cold_page(page, 0); + free_unref_page(page); } } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index ac0717a90ca6..1ec3aee43624 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -404,7 +404,7 @@ void pte_fragment_free(unsigned long *table, int kernel) if (put_page_testzero(page)) { if (!kernel) pgtable_page_dtor(page); - free_hot_cold_page(page, 0); + free_unref_page(page); } } diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 43607bb12cc2..cf4c1cb17dcd 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce_limit = STACK_TOP_MAX; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + /* pgd_alloc() did not account this pud */ + mm_inc_nr_puds(mm); break; case -PAGE_SIZE: /* forked 5-level task, set new asce with new_mm->pgd */ @@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk, /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - /* pgd_alloc() did not increase mm->nr_pmds */ + /* pgd_alloc() did not account this pmd */ mm_inc_nr_pmds(mm); } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index e1d751ae2498..1a2526676a87 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void) dwarf_frame_cachep = kmem_cache_create("dwarf_frames", sizeof(struct dwarf_frame), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_reg_cachep = kmem_cache_create("dwarf_regs", sizeof(struct dwarf_reg), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, dwarf_frame_cachep); diff --git a/arch/sh/kernel/head_64.S b/arch/sh/kernel/head_64.S index defd851abefa..cca491397a28 100644 --- a/arch/sh/kernel/head_64.S +++ b/arch/sh/kernel/head_64.S @@ -101,14 +101,6 @@ empty_zero_page: mmu_pdtp_cache: .space PAGE_SIZE, 0 - .global empty_bad_page -empty_bad_page: - .space PAGE_SIZE, 0 - - .global empty_bad_pte_table -empty_bad_pte_table: - .space PAGE_SIZE, 0 - .global fpu_in_use fpu_in_use: .quad 0 diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index b2d9963d5978..68b1a67533ce 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -59,7 +59,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); + SLAB_PANIC, NULL); } #ifdef CONFIG_SH_FPU_EMU diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index fd9d9bac7cfa..5a9e96be1665 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -231,6 +231,36 @@ extern unsigned long _PAGE_ALL_SZ_BITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) +/* This macro must be updated when the size of struct page grows above 80 + * or reduces below 64. + * The idea that compiler optimizes out switch() statement, and only + * leaves clrx instructions + */ +#define mm_zero_struct_page(pp) do { \ + unsigned long *_pp = (void *)(pp); \ + \ + /* Check that struct page is either 64, 72, or 80 bytes */ \ + BUILD_BUG_ON(sizeof(struct page) & 7); \ + BUILD_BUG_ON(sizeof(struct page) < 64); \ + BUILD_BUG_ON(sizeof(struct page) > 80); \ + \ + switch (sizeof(struct page)) { \ + case 80: \ + _pp[9] = 0; /* fallthrough */ \ + case 72: \ + _pp[8] = 0; /* fallthrough */ \ + default: \ + _pp[7] = 0; \ + _pp[6] = 0; \ + _pp[5] = 0; \ + _pp[4] = 0; \ + _pp[3] = 0; \ + _pp[2] = 0; \ + _pp[1] = 0; \ + _pp[0] = 0; \ + } \ +} while (0) + /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5078b7f68890..0112d6942288 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } void hugetlb_free_pgd_range(struct mmu_gather *tlb, diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 61bdc1270d19..55ba62957e64 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2540,10 +2540,17 @@ void __init mem_init(void) { high_memory = __va(last_valid_pfn << PAGE_SHIFT); - register_page_bootmem_info(); free_all_bootmem(); /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and free_all_bootmem() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + + /* * Set up the zero page, mark it reserved, so that page count * is not manipulated when freeing the page from user ptes. */ @@ -2637,30 +2644,19 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, vstart = vstart & PMD_MASK; vend = ALIGN(vend, PMD_SIZE); for (; vstart < vend; vstart += PMD_SIZE) { - pgd_t *pgd = pgd_offset_k(vstart); + pgd_t *pgd = vmemmap_pgd_populate(vstart, node); unsigned long pte; pud_t *pud; pmd_t *pmd; - if (pgd_none(*pgd)) { - pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node); - - if (!new) - return -ENOMEM; - pgd_populate(&init_mm, pgd, new); - } - - pud = pud_offset(pgd, vstart); - if (pud_none(*pud)) { - pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node); + if (!pgd) + return -ENOMEM; - if (!new) - return -ENOMEM; - pud_populate(&init_mm, pud, new); - } + pud = vmemmap_pud_populate(pgd, vstart, node); + if (!pud) + return -ENOMEM; pmd = pmd_offset(pud, vstart); - pte = pmd_val(*pmd); if (!(pte & _PAGE_VALID)) { void *block = vmemmap_alloc_block(PMD_SIZE, node); @@ -2927,7 +2923,7 @@ void __flush_tlb_all(void) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); pte_t *pte = NULL; if (page) @@ -2939,11 +2935,11 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; if (!pgtable_page_ctor(page)) { - free_hot_cold_page(page, 0); + free_unref_page(page); return NULL; } return (pte_t *) page_address(page); diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index b51cc28acd0a..4432f31e8479 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -409,7 +409,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) if (put_page_testzero(page)) { homecache_change_page_home(page, order, PAGE_HOME_HASH); if (order == 0) { - free_hot_cold_page(page, false); + free_unref_page(page); } else { init_page_count(page); __free_pages(page, order); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index e7437ec62710..3c0e470ea646 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -22,8 +22,6 @@ /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ unsigned long *empty_zero_page = NULL; EXPORT_SYMBOL(empty_zero_page); -/* allocated in paging_init and unchanged thereafter */ -static unsigned long *empty_bad_page = NULL; /* * Initialized during boot, and readonly for initializing page tables @@ -146,7 +144,6 @@ void __init paging_init(void) int i; empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(zones_size); i++) zones_size[i] = 0; diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 26775793c204..f0fdb268f8f2 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); #define pgd_alloc(mm) get_pgd_slow(mm) #define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* * Allocate one PTE table. diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index c572a28c76c9..a830a300aaa1 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); free: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f08977d82ca0..df3276d6bfe3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -110,9 +110,8 @@ config X86 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB - select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT @@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT config X86_DIRECT_GBPAGES def_bool y - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK + depends on X86_64 && !DEBUG_PAGEALLOC ---help--- Certain kernel features effectively disable kernel linear 1 GB mappings (even if the CPU otherwise diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a20eacd9c7e9..3e73bc255e4e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32 endif export CONFIG_X86_X32_ABI -# Don't unroll struct assignments with kmemcheck enabled -ifeq ($(CONFIG_KMEMCHECK),y) - KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) -endif - # # If the function graph tracer is used with mcount instead of fentry, # '-maccumulate-outgoing-args' is needed to prevent a GCC bug diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 43cbe843de8d..0350d99bb8fd 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -7,7 +7,6 @@ * Documentation/DMA-API.txt for documentation. */ -#include <linux/kmemcheck.h> #include <linux/scatterlist.h> #include <linux/dma-debug.h> #include <asm/io.h> diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h index 945a0337fbcf..ea32a7d3cf1b 100644 --- a/arch/x86/include/asm/kmemcheck.h +++ b/arch/x86/include/asm/kmemcheck.h @@ -1,43 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_KMEMCHECK_H -#define ASM_X86_KMEMCHECK_H - -#include <linux/types.h> -#include <asm/ptrace.h> - -#ifdef CONFIG_KMEMCHECK -bool kmemcheck_active(struct pt_regs *regs); - -void kmemcheck_show(struct pt_regs *regs); -void kmemcheck_hide(struct pt_regs *regs); - -bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code); -bool kmemcheck_trap(struct pt_regs *regs); -#else -static inline bool kmemcheck_active(struct pt_regs *regs) -{ - return false; -} - -static inline void kmemcheck_show(struct pt_regs *regs) -{ -} - -static inline void kmemcheck_hide(struct pt_regs *regs) -{ -} - -static inline bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code) -{ - return false; -} - -static inline bool kmemcheck_trap(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_KMEMCHECK */ - -#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f735c3016325..09f9e1e00e3b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -667,11 +667,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return false; } -static inline int pte_hidden(pte_t pte) -{ - return pte_flags(pte) & _PAGE_HIDDEN; -} - static inline int pmd_present(pmd_t pmd) { /* diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 9e9b05fc4860..3696398a9475 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -32,7 +32,6 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 -#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 @@ -79,18 +78,6 @@ #define _PAGE_KNL_ERRATUM_MASK 0 #endif -#ifdef CONFIG_KMEMCHECK -#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) -#else -#define _PAGE_HIDDEN (_AT(pteval_t, 0)) -#endif - -/* - * The same hidden bit is used by kmemcheck, but since kmemcheck - * works on kernel pages while soft-dirty engine on user space, - * they do not conflict with each other. - */ - #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 076502241eae..55d392c6bd29 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) * No 3D Now! */ -#ifndef CONFIG_KMEMCHECK - #if (__GNUC__ >= 4) #define memcpy(t, f, n) __builtin_memcpy(t, f, n) #else @@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(t, f, n) __memcpy((t), (f), (n)) -#endif #endif #endif /* !CONFIG_FORTIFY_SOURCE */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 0b1b4445f4c5..533f74c300c2 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_FORTIFY_SOURCE -#ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ @@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len); __ret; \ }) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) -#endif #endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 1f5c5161ead6..45c8605467f1 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -1,7 +1,4 @@ -#ifdef CONFIG_KMEMCHECK -/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ -# include <asm-generic/xor.h> -#elif !defined(_ASM_X86_XOR_H) +#ifndef _ASM_X86_XOR_H #define _ASM_X86_XOR_H /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b720dacac051..b1af22073e28 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); -#ifdef CONFIG_KMEMCHECK - /* - * P4s have a "fast strings" feature which causes single- - * stepping REP instructions to only generate a #DB on - * cache-line boundaries. - * - * Ingo Molnar reported a Pentium D (model 6) and a Xeon - * (model 2) with the same problem. - */ - if (c->x86 == 15) - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) - pr_info("kmemcheck: Disabling fast string operations\n"); -#endif - /* * If fast string is not enabled in IA32_MISC_ENABLE for any reason, * clear the fast string and enhanced fast string CPU capabilities. diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 7d7715dde901..e5ec3cafa72e 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more virtual address space for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b7b0f74a2150..989514c94a55 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,7 +42,6 @@ #include <linux/edac.h> #endif -#include <asm/kmemcheck.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> @@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions! */ - if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - goto exit; - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 7ba7f3d7f477..8e13b8cc6bed 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck/ - KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3109ba6c6ede..78ca9a8ee454 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -20,7 +20,6 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ -#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ @@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. */ - if (kmemcheck_active(regs)) - kmemcheck_hide(regs); prefetchw(&mm->mmap_sem); if (unlikely(kmmio_fault(regs, address))) @@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; - - if (kmemcheck_fault(regs, address, error_code)) - return; } /* Can handle a stale RO->RW TLB: */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a22c2b95e513..6fdf91ef130a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num) unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); - return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | - __GFP_ZERO, order); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { @@ -164,12 +163,11 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { /* - * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will - * use small pages. + * For pagealloc debugging, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK)) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index adcea90a2046..4a837289f2ad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -184,7 +184,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -1173,12 +1173,18 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - register_page_bootmem_info(); - /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and free_all_bootmem() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + /* Register memory areas for /proc/kcore */ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_OTHER); @@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, vmemmap_verify((pte_t *)pmd, node, addr, next); continue; } - pr_warn_once("vmemmap: falling back to regular page backing\n"); if (vmemmap_populate_basepages(addr, next, node)) return -ENOMEM; } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 2b60dc6e64b1..99dfed6dfef8 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -4,12 +4,14 @@ #include <linux/bootmem.h> #include <linux/kasan.h> #include <linux/kdebug.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/vmalloc.h> #include <asm/e820/types.h> +#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static int __init map_range(struct range *range) +static __init void *early_alloc(size_t size, int nid) +{ + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, int nid) +{ + pte_t *pte; + + if (pmd_none(*pmd)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_PSE) && + ((end - addr) == PMD_SIZE) && + IS_ALIGNED(addr, PMD_SIZE)) { + p = early_alloc(PMD_SIZE, nid); + if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PMD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pmd_populate_kernel(&init_mm, pmd, p); + } + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t entry; + void *p; + + if (!pte_none(*pte)) + continue; + + p = early_alloc(PAGE_SIZE, nid); + entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, + unsigned long end, int nid) +{ + pmd_t *pmd; + unsigned long next; + + if (pud_none(*pud)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_GBPAGES) && + ((end - addr) == PUD_SIZE) && + IS_ALIGNED(addr, PUD_SIZE)) { + p = early_alloc(PUD_SIZE, nid); + if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PUD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pud_populate(&init_mm, pud, p); + } + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_large(*pmd)) + kasan_populate_pmd(pmd, addr, next, nid); + } while (pmd++, addr = next, addr != end); +} + +static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, int nid) +{ + pud_t *pud; + unsigned long next; + + if (p4d_none(*p4d)) { + void *p = early_alloc(PAGE_SIZE, nid); + + p4d_populate(&init_mm, p4d, p); + } + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_large(*pud)) + kasan_populate_pud(pud, addr, next, nid); + } while (pud++, addr = next, addr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, + unsigned long end, int nid) +{ + void *p; + p4d_t *p4d; + unsigned long next; + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, nid); + pgd_populate(&init_mm, pgd, p); + } + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + kasan_populate_p4d(p4d, addr, next, nid); + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, + int nid) +{ + pgd_t *pgd; + unsigned long next; + + addr = addr & PAGE_MASK; + end = round_up(end, PAGE_SIZE); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + kasan_populate_pgd(pgd, addr, next, nid); + } while (pgd++, addr = next, addr != end); +} + +static void __init map_range(struct range *range) { unsigned long start; unsigned long end; @@ -26,7 +155,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - return vmemmap_populate(start, end, NUMA_NO_NODE); + kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); } static void __init clear_pgds(unsigned long start, @@ -189,16 +318,16 @@ void __init kasan_init(void) if (pfn_mapped[i].end == 0) break; - if (map_range(&pfn_mapped[i])) - panic("kasan: unable to allocate shadow!"); + map_range(&pfn_mapped[i]); } + kasan_populate_zero_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), kasan_mem_to_shadow((void *)__START_KERNEL_map)); - vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), - (unsigned long)kasan_mem_to_shadow(_end), - NUMA_NO_NODE); + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile deleted file mode 100644 index 520b3bce4095..000000000000 --- a/arch/x86/mm/kmemcheck/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 872ec4159a68..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -1,228 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/interrupt.h> -#include <linux/kdebug.h> -#include <linux/kmemcheck.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/stacktrace.h> -#include <linux/string.h> - -#include "error.h" -#include "shadow.h" - -enum kmemcheck_error_type { - KMEMCHECK_ERROR_INVALID_ACCESS, - KMEMCHECK_ERROR_BUG, -}; - -#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) - -struct kmemcheck_error { - enum kmemcheck_error_type type; - - union { - /* KMEMCHECK_ERROR_INVALID_ACCESS */ - struct { - /* Kind of access that caused the error */ - enum kmemcheck_shadow state; - /* Address and size of the erroneous read */ - unsigned long address; - unsigned int size; - }; - }; - - struct pt_regs regs; - struct stack_trace trace; - unsigned long trace_entries[32]; - - /* We compress it to a char. */ - unsigned char shadow_copy[SHADOW_COPY_SIZE]; - unsigned char memory_copy[SHADOW_COPY_SIZE]; -}; - -/* - * Create a ring queue of errors to output. We can't call printk() directly - * from the kmemcheck traps, since this may call the console drivers and - * result in a recursive fault. - */ -static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; -static unsigned int error_count; -static unsigned int error_rd; -static unsigned int error_wr; -static unsigned int error_missed_count; - -static struct kmemcheck_error *error_next_wr(void) -{ - struct kmemcheck_error *e; - - if (error_count == ARRAY_SIZE(error_fifo)) { - ++error_missed_count; - return NULL; - } - - e = &error_fifo[error_wr]; - if (++error_wr == ARRAY_SIZE(error_fifo)) - error_wr = 0; - ++error_count; - return e; -} - -static struct kmemcheck_error *error_next_rd(void) -{ - struct kmemcheck_error *e; - - if (error_count == 0) - return NULL; - - e = &error_fifo[error_rd]; - if (++error_rd == ARRAY_SIZE(error_fifo)) - error_rd = 0; - --error_count; - return e; -} - -void kmemcheck_error_recall(void) -{ - static const char *desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", - [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", - [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", - [KMEMCHECK_SHADOW_FREED] = "freed", - }; - - static const char short_desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', - [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', - [KMEMCHECK_SHADOW_INITIALIZED] = 'i', - [KMEMCHECK_SHADOW_FREED] = 'f', - }; - - struct kmemcheck_error *e; - unsigned int i; - - e = error_next_rd(); - if (!e) - return; - - switch (e->type) { - case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", - 8 * e->size, e->state < ARRAY_SIZE(desc) ? - desc[e->state] : "(invalid shadow state)", - (void *) e->address); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk(KERN_CONT "%02x", e->memory_copy[i]); - printk(KERN_CONT "\n"); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) { - if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); - else - printk(KERN_CONT " ?"); - } - printk(KERN_CONT "\n"); - printk(KERN_WARNING "%*c\n", 2 + 2 - * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); - break; - case KMEMCHECK_ERROR_BUG: - printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); - break; - } - - __show_regs(&e->regs, 1); - print_stack_trace(&e->trace, 0); -} - -static void do_wakeup(unsigned long data) -{ - while (error_count > 0) - kmemcheck_error_recall(); - - if (error_missed_count > 0) { - printk(KERN_WARNING "kmemcheck: Lost %d error reports because " - "the queue was too small\n", error_missed_count); - error_missed_count = 0; - } -} - -static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); - -/* - * Save the context of an error report. - */ -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs) -{ - static unsigned long prev_ip; - - struct kmemcheck_error *e; - void *shadow_copy; - void *memory_copy; - - /* Don't report several adjacent errors from the same EIP. */ - if (regs->ip == prev_ip) - return; - prev_ip = regs->ip; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_INVALID_ACCESS; - - e->state = state; - e->address = address; - e->size = size; - - /* Save regs */ - memcpy(&e->regs, regs, sizeof(*regs)); - - /* Save stack trace */ - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 0; - save_stack_trace_regs(regs, &e->trace); - - /* Round address down to nearest 16 bytes */ - shadow_copy = kmemcheck_shadow_lookup(address - & ~(SHADOW_COPY_SIZE - 1)); - BUG_ON(!shadow_copy); - - memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); - - kmemcheck_show_addr(address); - memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); - memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); - kmemcheck_hide_addr(address); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} - -/* - * Save the context of a kmemcheck bug. - */ -void kmemcheck_error_save_bug(struct pt_regs *regs) -{ - struct kmemcheck_error *e; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_BUG; - - memcpy(&e->regs, regs, sizeof(*regs)); - - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 1; - save_stack_trace(&e->trace); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h index 39f80d7a874d..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/error.h +++ b/arch/x86/mm/kmemcheck/error.h @@ -1,16 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H -#define ARCH__X86__MM__KMEMCHECK__ERROR_H - -#include <linux/ptrace.h> - -#include "shadow.h" - -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs); - -void kmemcheck_error_save_bug(struct pt_regs *regs); - -void kmemcheck_error_recall(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c deleted file mode 100644 index 4515bae36bbe..000000000000 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ /dev/null @@ -1,658 +0,0 @@ -/** - * kmemcheck - a heavyweight memory checker for the linux kernel - * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no> - * (With a lot of help from Ingo Molnar and Pekka Enberg.) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2) as - * published by the Free Software Foundation. - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/kernel.h> -#include <linux/kmemcheck.h> -#include <linux/mm.h> -#include <linux/page-flags.h> -#include <linux/percpu.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/types.h> - -#include <asm/cacheflush.h> -#include <asm/kmemcheck.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> - -#include "error.h" -#include "opcode.h" -#include "pte.h" -#include "selftest.h" -#include "shadow.h" - - -#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 0 -#endif - -#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 1 -#endif - -#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT -# define KMEMCHECK_ENABLED 2 -#endif - -int kmemcheck_enabled = KMEMCHECK_ENABLED; - -int __init kmemcheck_init(void) -{ -#ifdef CONFIG_SMP - /* - * Limit SMP to use a single CPU. We rely on the fact that this code - * runs before SMP is set up. - */ - if (setup_max_cpus > 1) { - printk(KERN_INFO - "kmemcheck: Limiting number of CPUs to 1.\n"); - setup_max_cpus = 1; - } -#endif - - if (!kmemcheck_selftest()) { - printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); - kmemcheck_enabled = 0; - return -EINVAL; - } - - printk(KERN_INFO "kmemcheck: Initialized\n"); - return 0; -} - -early_initcall(kmemcheck_init); - -/* - * We need to parse the kmemcheck= option before any memory is allocated. - */ -static int __init param_kmemcheck(char *str) -{ - int val; - int ret; - - if (!str) - return -EINVAL; - - ret = kstrtoint(str, 0, &val); - if (ret) - return ret; - kmemcheck_enabled = val; - return 0; -} - -early_param("kmemcheck", param_kmemcheck); - -int kmemcheck_show_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -int kmemcheck_hide_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -struct kmemcheck_context { - bool busy; - int balance; - - /* - * There can be at most two memory operands to an instruction, but - * each address can cross a page boundary -- so we may need up to - * four addresses that must be hidden/revealed for each fault. - */ - unsigned long addr[4]; - unsigned long n_addrs; - unsigned long flags; - - /* Data size of the instruction that caused a fault. */ - unsigned int size; -}; - -static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); - -bool kmemcheck_active(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - return data->balance > 0; -} - -/* Save an address that needs to be shown/hidden */ -static void kmemcheck_save_addr(unsigned long addr) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); - data->addr[data->n_addrs++] = addr; -} - -static unsigned int kmemcheck_show_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_show_addr(data->addr[i]); - - return n; -} - -static unsigned int kmemcheck_hide_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_hide_addr(data->addr[i]); - - return n; -} - -/* - * Called from the #PF handler. - */ -void kmemcheck_show(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 0)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->balance = 0; - return; - } - - /* - * None of the addresses actually belonged to kmemcheck. Note that - * this is not an error. - */ - if (kmemcheck_show_all() == 0) - return; - - ++data->balance; - - /* - * The IF needs to be cleared as well, so that the faulting - * instruction can run "uninterrupted". Otherwise, we might take - * an interrupt and start executing that before we've had a chance - * to hide the page again. - * - * NOTE: In the rare case of multiple faults, we must not override - * the original flags: - */ - if (!(regs->flags & X86_EFLAGS_TF)) - data->flags = regs->flags; - - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; -} - -/* - * Called from the #DB handler. - */ -void kmemcheck_hide(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - int n; - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 1)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->n_addrs = 0; - data->balance = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; - return; - } - - if (kmemcheck_enabled) - n = kmemcheck_hide_all(); - else - n = kmemcheck_show_all(); - - if (n == 0) - return; - - --data->balance; - - data->n_addrs = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; -} - -void kmemcheck_show_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -bool kmemcheck_page_is_tracked(struct page *p) -{ - /* This will also check the "hidden" flag of the PTE. */ - return kmemcheck_pte_lookup((unsigned long) page_address(p)); -} - -void kmemcheck_hide_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -/* Access may NOT cross page boundary */ -static void kmemcheck_read_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - enum kmemcheck_shadow status; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; - - /* Don't warn about it again. */ - kmemcheck_shadow_set(shadow, size); -} - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - enum kmemcheck_shadow status; - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return true; - - status = kmemcheck_shadow_test_all(shadow, size); - - return status == KMEMCHECK_SHADOW_INITIALIZED; -} - -/* Access may cross page boundary */ -static void kmemcheck_read(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_read_strict(regs, addr, size); - return; - } - - /* - * What we do is basically to split the access across the - * two pages and handle each part separately. Yes, this means - * that we may now see reads that are 3 + 5 bytes, for - * example (and if both are uninitialized, there will be two - * reports), but it makes the code a lot simpler. - */ - kmemcheck_read_strict(regs, addr, next_page - addr); - kmemcheck_read_strict(regs, next_page, next_addr - next_page); -} - -static void kmemcheck_write_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - kmemcheck_shadow_set(shadow, size); -} - -static void kmemcheck_write(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_write_strict(regs, addr, size); - return; - } - - /* See comment in kmemcheck_read(). */ - kmemcheck_write_strict(regs, addr, next_page - addr); - kmemcheck_write_strict(regs, next_page, next_addr - next_page); -} - -/* - * Copying is hard. We have two addresses, each of which may be split across - * a page (and each page will have different shadow addresses). - */ -static void kmemcheck_copy(struct pt_regs *regs, - unsigned long src_addr, unsigned long dst_addr, unsigned int size) -{ - uint8_t shadow[8]; - enum kmemcheck_shadow status; - - unsigned long page; - unsigned long next_addr; - unsigned long next_page; - - uint8_t *x; - unsigned int i; - unsigned int n; - - BUG_ON(size > sizeof(shadow)); - - page = src_addr & PAGE_MASK; - next_addr = src_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < size; ++i) - shadow[i] = x[i]; - } else { - for (i = 0; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } else { - n = next_page - src_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < n; ++i) - shadow[i] = x[i]; - } else { - /* Not tracked */ - for (i = 0; i < n; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) - shadow[i] = x[i - n]; - } else { - /* Not tracked */ - for (i = n; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - page = dst_addr & PAGE_MASK; - next_addr = dst_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < size; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } else { - n = next_page - dst_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < n; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) { - x[i - n] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } - - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, src_addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; -} - -enum kmemcheck_method { - KMEMCHECK_READ, - KMEMCHECK_WRITE, -}; - -static void kmemcheck_access(struct pt_regs *regs, - unsigned long fallback_address, enum kmemcheck_method fallback_method) -{ - const uint8_t *insn; - const uint8_t *insn_primary; - unsigned int size; - - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - /* Recursive fault -- ouch. */ - if (data->busy) { - kmemcheck_show_addr(fallback_address); - kmemcheck_error_save_bug(regs); - return; - } - - data->busy = true; - - insn = (const uint8_t *) regs->ip; - insn_primary = kmemcheck_opcode_get_primary(insn); - - kmemcheck_opcode_decode(insn, &size); - - switch (insn_primary[0]) { -#ifdef CONFIG_KMEMCHECK_BITOPS_OK - /* AND, OR, XOR */ - /* - * Unfortunately, these instructions have to be excluded from - * our regular checking since they access only some (and not - * all) bits. This clears out "bogus" bitfield-access warnings. - */ - case 0x80: - case 0x81: - case 0x82: - case 0x83: - switch ((insn_primary[1] >> 3) & 7) { - /* OR */ - case 1: - /* AND */ - case 4: - /* XOR */ - case 6: - kmemcheck_write(regs, fallback_address, size); - goto out; - - /* ADD */ - case 0: - /* ADC */ - case 2: - /* SBB */ - case 3: - /* SUB */ - case 5: - /* CMP */ - case 7: - break; - } - break; -#endif - - /* MOVS, MOVSB, MOVSW, MOVSD */ - case 0xa4: - case 0xa5: - /* - * These instructions are special because they take two - * addresses, but we only get one page fault. - */ - kmemcheck_copy(regs, regs->si, regs->di, size); - goto out; - - /* CMPS, CMPSB, CMPSW, CMPSD */ - case 0xa6: - case 0xa7: - kmemcheck_read(regs, regs->si, size); - kmemcheck_read(regs, regs->di, size); - goto out; - } - - /* - * If the opcode isn't special in any way, we use the data from the - * page fault handler to determine the address and type of memory - * access. - */ - switch (fallback_method) { - case KMEMCHECK_READ: - kmemcheck_read(regs, fallback_address, size); - goto out; - case KMEMCHECK_WRITE: - kmemcheck_write(regs, fallback_address, size); - goto out; - } - -out: - data->busy = false; -} - -bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - pte_t *pte; - - /* - * XXX: Is it safe to assume that memory accesses from virtual 86 - * mode or non-kernel code segments will _never_ access kernel - * memory (e.g. tracked pages)? For now, we need this to avoid - * invoking kmemcheck for PnP BIOS calls. - */ - if (regs->flags & X86_VM_MASK) - return false; - if (regs->cs != __KERNEL_CS) - return false; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return false; - - WARN_ON_ONCE(in_nmi()); - - if (error_code & 2) - kmemcheck_access(regs, address, KMEMCHECK_WRITE); - else - kmemcheck_access(regs, address, KMEMCHECK_READ); - - kmemcheck_show(regs); - return true; -} - -bool kmemcheck_trap(struct pt_regs *regs) -{ - if (!kmemcheck_active(regs)) - return false; - - /* We're done. */ - kmemcheck_hide(regs); - return true; -} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index df8109ddf7fe..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -1,107 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/types.h> - -#include "opcode.h" - -static bool opcode_is_prefix(uint8_t b) -{ - return - /* Group 1 */ - b == 0xf0 || b == 0xf2 || b == 0xf3 - /* Group 2 */ - || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 - /* Group 3 */ - || b == 0x66 - /* Group 4 */ - || b == 0x67; -} - -#ifdef CONFIG_X86_64 -static bool opcode_is_rex_prefix(uint8_t b) -{ - return (b & 0xf0) == 0x40; -} -#else -static bool opcode_is_rex_prefix(uint8_t b) -{ - return false; -} -#endif - -#define REX_W (1 << 3) - -/* - * This is a VERY crude opcode decoder. We only need to find the size of the - * load/store that caused our #PF and this should work for all the opcodes - * that we care about. Moreover, the ones who invented this instruction set - * should be shot. - */ -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) -{ - /* Default operand size */ - int operand_size_override = 4; - - /* prefixes */ - for (; opcode_is_prefix(*op); ++op) { - if (*op == 0x66) - operand_size_override = 2; - } - - /* REX prefix */ - if (opcode_is_rex_prefix(*op)) { - uint8_t rex = *op; - - ++op; - if (rex & REX_W) { - switch (*op) { - case 0x63: - *size = 4; - return; - case 0x0f: - ++op; - - switch (*op) { - case 0xb6: - case 0xbe: - *size = 1; - return; - case 0xb7: - case 0xbf: - *size = 2; - return; - } - - break; - } - - *size = 8; - return; - } - } - - /* escape opcode */ - if (*op == 0x0f) { - ++op; - - /* - * This is move with zero-extend and sign-extend, respectively; - * we don't have to think about 0xb6/0xbe, because this is - * already handled in the conditional below. - */ - if (*op == 0xb7 || *op == 0xbf) - operand_size_override = 2; - } - - *size = (*op & 1) ? operand_size_override : 1; -} - -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) -{ - /* skip prefixes */ - while (opcode_is_prefix(*op)) - ++op; - if (opcode_is_rex_prefix(*op)) - ++op; - return op; -} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h index 51a1ce94c24a..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/opcode.h +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -1,10 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H -#define ARCH__X86__MM__KMEMCHECK__OPCODE_H - -#include <linux/types.h> - -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); - -#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c index 8a03be90272a..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/pte.c +++ b/arch/x86/mm/kmemcheck/pte.c @@ -1,23 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/mm.h> - -#include <asm/pgtable.h> - -#include "pte.h" - -pte_t *kmemcheck_pte_lookup(unsigned long address) -{ - pte_t *pte; - unsigned int level; - - pte = lookup_address(address, &level); - if (!pte) - return NULL; - if (level != PG_LEVEL_4K) - return NULL; - if (!pte_hidden(*pte)) - return NULL; - - return pte; -} - diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h index b595612382c2..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/pte.h +++ b/arch/x86/mm/kmemcheck/pte.h @@ -1,11 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H -#define ARCH__X86__MM__KMEMCHECK__PTE_H - -#include <linux/mm.h> - -#include <asm/pgtable.h> - -pte_t *kmemcheck_pte_lookup(unsigned long address); - -#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 7ce0be1f99eb..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,71 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bug.h> -#include <linux/kernel.h> - -#include "opcode.h" -#include "selftest.h" - -struct selftest_opcode { - unsigned int expected_size; - const uint8_t *insn; - const char *desc; -}; - -static const struct selftest_opcode selftest_opcodes[] = { - /* REP MOVS */ - {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"}, - {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"}, - - /* MOVZX / MOVZXD */ - {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"}, - {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"}, - - /* MOVSX / MOVSXD */ - {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"}, - {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"}, - -#ifdef CONFIG_X86_64 - /* MOVZX / MOVZXD */ - {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"}, - {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"}, - - /* MOVSX / MOVSXD */ - {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"}, - {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"}, - {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"}, -#endif -}; - -static bool selftest_opcode_one(const struct selftest_opcode *op) -{ - unsigned size; - - kmemcheck_opcode_decode(op->insn, &size); - - if (size == op->expected_size) - return true; - - printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", - op->desc, op->expected_size, size); - return false; -} - -static bool selftest_opcodes_all(void) -{ - bool pass = true; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) - pass = pass && selftest_opcode_one(&selftest_opcodes[i]); - - return pass; -} - -bool kmemcheck_selftest(void) -{ - bool pass = true; - - pass = pass && selftest_opcodes_all(); - - return pass; -} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h index 8d759aae453d..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/selftest.h +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -1,7 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H -#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H - -bool kmemcheck_selftest(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c deleted file mode 100644 index c2638a7d2c10..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.c +++ /dev/null @@ -1,173 +0,0 @@ -#include <linux/kmemcheck.h> -#include <linux/export.h> -#include <linux/mm.h> - -#include <asm/page.h> -#include <asm/pgtable.h> - -#include "pte.h" -#include "shadow.h" - -/* - * Return the shadow address for the given address. Returns NULL if the - * address is not tracked. - * - * We need to be extremely careful not to follow any invalid pointers, - * because this function can be called for *any* possible address. - */ -void *kmemcheck_shadow_lookup(unsigned long address) -{ - pte_t *pte; - struct page *page; - - if (!virt_addr_valid(address)) - return NULL; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return NULL; - - page = virt_to_page(address); - if (!page->shadow) - return NULL; - return page->shadow + (address & (PAGE_SIZE - 1)); -} - -static void mark_shadow(void *address, unsigned int n, - enum kmemcheck_shadow status) -{ - unsigned long addr = (unsigned long) address; - unsigned long last_addr = addr + n - 1; - unsigned long page = addr & PAGE_MASK; - unsigned long last_page = last_addr & PAGE_MASK; - unsigned int first_n; - void *shadow; - - /* If the memory range crosses a page boundary, stop there. */ - if (page == last_page) - first_n = n; - else - first_n = page + PAGE_SIZE - addr; - - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, first_n); - - addr += first_n; - n -= first_n; - - /* Do full-page memset()s. */ - while (n >= PAGE_SIZE) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, PAGE_SIZE); - - addr += PAGE_SIZE; - n -= PAGE_SIZE; - } - - /* Do the remaining page, if any. */ - if (n > 0) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, n); - } -} - -void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); -} - -void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); -} - -/* - * Fill the shadow memory of the given address such that the memory at that - * address is marked as being initialized. - */ -void kmemcheck_mark_initialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); -} -EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); - -void kmemcheck_mark_freed(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); -} - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); -} - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) -{ -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK - uint8_t *x; - unsigned int i; - - x = shadow; - - /* - * Make sure _some_ bytes are initialized. Gcc frequently generates - * code to access neighboring bytes. - */ - for (i = 0; i < size; ++i) { - if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -#else - return kmemcheck_shadow_test_all(shadow, size); -#endif -} - -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - - /* All bytes must be initialized. */ - for (i = 0; i < size; ++i) { - if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -} - -void kmemcheck_shadow_set(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - for (i = 0; i < size; ++i) - x[i] = KMEMCHECK_SHADOW_INITIALIZED; -} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index 49768dc18664..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -1,19 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H -#define ARCH__X86__MM__KMEMCHECK__SHADOW_H - -enum kmemcheck_shadow { - KMEMCHECK_SHADOW_UNALLOCATED, - KMEMCHECK_SHADOW_UNINITIALIZED, - KMEMCHECK_SHADOW_INITIALIZED, - KMEMCHECK_SHADOW_FREED, -}; - -void *kmemcheck_shadow_lookup(unsigned long address); - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, - unsigned int size); -void kmemcheck_shadow_set(void *shadow, unsigned int size); - -#endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3fe68483463c..85cf12219dea 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) @@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) static int alloc_pte_page(pmd_t *pmd) { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; @@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd) static int alloc_pmd_page(pud_t *pud) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; @@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { - p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; @@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { - pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17ebc5a978cc..96d456a94b03 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -7,7 +7,7 @@ #include <asm/fixmap.h> #include <asm/mtrr.h> -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 9e4ee5b04b2d..6a151ce70e86 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; + gfp_mask = GFP_KERNEL | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_page(gfp_mask); if (!efi_pgd) return -ENOMEM; diff --git a/block/blk-mq.c b/block/blk-mq.c index b600463791ec..11097477eeab 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2047,7 +2047,7 @@ static int blk_mq_init_hctx(struct request_queue *q, * Allocate space for all possible cpus to avoid allocation at * runtime */ - hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), GFP_KERNEL, node); if (!hctx->ctxs) goto unregister_cpu_notifier; diff --git a/crypto/xor.c b/crypto/xor.c index 263af9fb45ea..bce9fe7af40a 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -122,12 +122,7 @@ calibrate_xor_blocks(void) goto out; } - /* - * Note: Since the memory is not actually used for _anything_ but to - * test the XOR speed, we don't really want kmemcheck to warn about - * reading uninitialized bytes here. - */ - b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2); + b1 = (void *) __get_free_pages(GFP_KERNEL, 2); if (!b1) { printk(KERN_WARNING "xor: Yikes! No memory available.\n"); return -ENOMEM; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c1cf87718c2e..588360d79fca 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -20,6 +20,7 @@ #include <linux/radix-tree.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/backing-dev.h> #ifdef CONFIG_BLK_DEV_RAM_DAX #include <linux/pfn_t.h> #include <linux/dax.h> @@ -448,6 +449,7 @@ static struct brd_device *brd_alloc(int i) disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); + disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; #ifdef CONFIG_BLK_DEV_RAM_DAX queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5b8992beffec..4ed0a78fdc09 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -23,15 +23,15 @@ static const char * const backends[] = { #if IS_ENABLED(CONFIG_CRYPTO_LZ4) "lz4", #endif -#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE) - "deflate", -#endif #if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) "lz4hc", #endif #if IS_ENABLED(CONFIG_CRYPTO_842) "842", #endif +#if IS_ENABLED(CONFIG_CRYPTO_ZSTD) + "zstd", +#endif NULL }; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f149d3e61234..d70eba30003a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -122,14 +122,6 @@ static inline bool is_partial_io(struct bio_vec *bvec) } #endif -static void zram_revalidate_disk(struct zram *zram) -{ - revalidate_disk(zram->disk); - /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ - zram->disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; -} - /* * Check if request is within bounds and aligned on zram logical blocks. */ @@ -436,7 +428,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) WARN_ON_ONCE(!was_set); } -void zram_page_end_io(struct bio *bio) +static void zram_page_end_io(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; @@ -1373,7 +1365,8 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_revalidate_disk(zram); + + revalidate_disk(zram->disk); up_write(&zram->init_lock); return len; @@ -1420,7 +1413,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - zram_revalidate_disk(zram); + revalidate_disk(zram->disk); bdput(bdev); mutex_lock(&bdev->bd_mutex); @@ -1539,6 +1532,7 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); + /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. @@ -1563,6 +1557,8 @@ static int zram_add(void) if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); + zram->disk->queue->backing_dev_info->capabilities |= + (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO); add_disk(zram->disk); ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, diff --git a/drivers/char/random.c b/drivers/char/random.c index 6c7ccac2679e..ec42c8bb9b0d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -259,7 +259,6 @@ #include <linux/cryptohash.h> #include <linux/fips.h> #include <linux/ptrace.h> -#include <linux/kmemcheck.h> #include <linux/workqueue.h> #include <linux/irq.h> #include <linux/syscalls.h> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 60d8bedb694d..cd664832f9e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -553,8 +553,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, * invalidated it. Free it and try again */ release_pages(e->user_pages, - e->robj->tbo.ttm->num_pages, - false); + e->robj->tbo.ttm->num_pages); kvfree(e->user_pages); e->user_pages = NULL; } @@ -691,8 +690,7 @@ error_free_pages: continue; release_pages(e->user_pages, - e->robj->tbo.ttm->num_pages, - false); + e->robj->tbo.ttm->num_pages); kvfree(e->user_pages); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 6149a47fe63d..0bda8f2a188a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -347,7 +347,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, return 0; free_pages: - release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages, false); + release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages); unlock_mmap_sem: up_read(¤t->mm->mmap_sem); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index bc746131987f..d792959fac43 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -659,7 +659,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) return 0; release_pages: - release_pages(pages, pinned, 0); + release_pages(pages, pinned); return r; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 57881167ccd2..bcc8c2d7c7c9 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -779,7 +779,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( up_read(&mm->mmap_sem); if (ret < 0) { - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); return ERR_PTR(ret); } @@ -852,7 +852,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) } } - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); work = kmalloc(sizeof(*work), GFP_KERNEL); @@ -886,7 +886,7 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj) if (etnaviv_obj->pages) { int npages = etnaviv_obj->base.size >> PAGE_SHIFT; - release_pages(etnaviv_obj->pages, npages, 0); + release_pages(etnaviv_obj->pages, npages); kvfree(etnaviv_obj->pages); } put_task_struct(etnaviv_obj->userptr.task); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index ad524cb0f6fc..7982ad817c11 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1859,7 +1859,7 @@ static void i915_address_space_init(struct i915_address_space *vm, INIT_LIST_HEAD(&vm->unbound_list); list_add_tail(&vm->global_link, &dev_priv->vm_list); - pagevec_init(&vm->free_pages, false); + pagevec_init(&vm->free_pages); } static void i915_address_space_fini(struct i915_address_space *vm) diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 709efe2357ea..aa22361bd5a1 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -554,7 +554,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) } mutex_unlock(&obj->mm.lock); - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); i915_gem_object_put(obj); @@ -668,7 +668,7 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) __i915_gem_userptr_set_active(obj, true); if (IS_ERR(pages)) - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); return pages; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index bf69bf9086bf..1fdfc7a46072 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -597,7 +597,7 @@ release_sg: kfree(ttm->sg); release_pages: - release_pages(ttm->pages, pinned, 0); + release_pages(ttm->pages, pinned); return r; } diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 5243ad30dfc0..85dfbba427f6 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1667,8 +1667,9 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) } if (!rcd->rcvegrbuf_phys) { rcd->rcvegrbuf_phys = - kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), - GFP_KERNEL, rcd->node_id); + kmalloc_array_node(chunk, + sizeof(rcd->rcvegrbuf_phys[0]), + GFP_KERNEL, rcd->node_id); if (!rcd->rcvegrbuf_phys) goto bail_rcvegrbuf; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 410025a19729..9177df60742a 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -238,7 +238,7 @@ int rvt_driver_qp_init(struct rvt_dev_info *rdi) rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size; rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size); rdi->qp_dev->qp_table = - kmalloc_node(rdi->qp_dev->qp_table_size * + kmalloc_array_node(rdi->qp_dev->qp_table_size, sizeof(*rdi->qp_dev->qp_table), GFP_KERNEL, rdi->dparms.node); if (!rdi->qp_dev->qp_table) diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c index 1922cb8f6b88..1c5b7aec13d4 100644 --- a/drivers/misc/c2port/core.c +++ b/drivers/misc/c2port/core.c @@ -15,7 +15,6 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/idr.h> @@ -904,7 +903,6 @@ struct c2port_device *c2port_device_register(char *name, return ERR_PTR(-EINVAL); c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); - kmemcheck_annotate_bitfield(c2dev, flags); if (unlikely(!c2dev)) return ERR_PTR(-ENOMEM); diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 5417e4da64ca..7451922c209d 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rc = ena_alloc_rx_page(rx_ring, rx_info, - __GFP_COLD | GFP_ATOMIC | __GFP_COMP); + GFP_ATOMIC | __GFP_COMP); if (unlikely(rc < 0)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, "failed to alloc buffer for rx queue %d\n", diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index 45d92304068e..cc1e4f820e64 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -295,7 +295,7 @@ again: order = alloc_order; /* Try to obtain pages, decreasing order if necessary */ - gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN; + gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; while (order >= 0) { pages = alloc_pages_node(node, gfp, order); if (pages) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 0654e0c76bc2..519ca6534b85 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self) buff->flags = 0U; buff->len = AQ_CFG_RX_FRAME_MAX; - buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD | - __GFP_COMP, pages_order); + buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order); if (!buff->page) { err = -ENOMEM; goto err_exit; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h index 433f3619de8f..f2d1a076a038 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h @@ -198,7 +198,7 @@ static inline void struct sk_buff *skb; struct octeon_skb_page_info *skb_pg_info; - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_page(GFP_ATOMIC); if (unlikely(!page)) return NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 92aec17f4b4d..85e28efcda33 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) if (mlx4_en_prepare_rx_desc(priv, ring, ring->actual_size, - GFP_KERNEL | __GFP_COLD)) { + GFP_KERNEL)) { if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { en_err(priv, "Failed to allocate enough rx buffers\n"); return -ENOMEM; @@ -551,8 +551,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, do { if (mlx4_en_prepare_rx_desc(priv, ring, ring->prod & ring->size_mask, - GFP_ATOMIC | __GFP_COLD | - __GFP_MEMALLOC)) + GFP_ATOMIC | __GFP_MEMALLOC)) break; ring->prod++; } while (likely(--missing)); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 232044b1b7aa..1a603fdd9e80 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) } else { struct page *page; - page = alloc_page(GFP_KERNEL | __GFP_COLD); + page = alloc_page(GFP_KERNEL); frag = page ? page_address(page) : NULL; } if (!frag) { diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 29fea74bff2e..7b97a9969046 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring, { if (!rx_ring->pg_chunk.page) { u64 map; - rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP | - GFP_ATOMIC, + rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC, qdev->lbq_buf_order); if (unlikely(!rx_ring->pg_chunk.page)) { netif_err(qdev, drv, qdev->ndev, diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c index 382019b302db..02456ed13a7d 100644 --- a/drivers/net/ethernet/sfc/falcon/rx.c +++ b/drivers/net/ethernet/sfc/falcon/rx.c @@ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic) do { page = ef4_reuse_page(rx_queue); if (page == NULL) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | + page = alloc_pages(__GFP_COMP | (atomic ? GFP_ATOMIC : GFP_KERNEL), efx->rx_buffer_order); if (unlikely(page == NULL)) diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 8cb60513dca2..cfe76aad79ee 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic) do { page = efx_reuse_page(rx_queue); if (page == NULL) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | + page = alloc_pages(__GFP_COMP | (atomic ? GFP_ATOMIC : GFP_KERNEL), efx->rx_buffer_order); if (unlikely(page == NULL)) diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c index e9672b1f9968..031cf9c3435a 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c @@ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata, dma_addr_t pages_dma; /* Try to obtain pages, decreasing order if necessary */ - gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN; + gfp |= __GFP_COMP | __GFP_NOWARN; while (order >= 0) { pages = alloc_pages(gfp, order); if (pages) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 15e2e3031d36..ed58c746e4af 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) sw_data[0] = (u32)bufptr; } else { /* Allocate a secondary receive queue entry */ - page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD); + page = alloc_page(GFP_ATOMIC | GFP_DMA); if (unlikely(!page)) { dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n"); goto fail; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index edf984406ba0..19a985ef9104 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1030,7 +1030,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, int err; bool oom; - gfp |= __GFP_COLD; do { if (vi->mergeable_rx_bufs) err = add_recvbuf_mergeable(vi, rq, gfp); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index d5612bd1cc81..e949e3302af4 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -23,6 +23,7 @@ #include <linux/ndctl.h> #include <linux/fs.h> #include <linux/nd.h> +#include <linux/backing-dev.h> #include "btt.h" #include "nd.h" @@ -1402,6 +1403,8 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->private_data = btt; btt->btt_disk->queue = btt->btt_queue; btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + btt->btt_disk->queue->backing_dev_info->capabilities |= + BDI_CAP_SYNCHRONOUS_IO; blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 39dfd7affa31..7fbc5c5dc8e1 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -31,6 +31,7 @@ #include <linux/uio.h> #include <linux/dax.h> #include <linux/nd.h> +#include <linux/backing-dev.h> #include "pmem.h" #include "pfn.h" #include "nd.h" @@ -394,6 +395,7 @@ static int pmem_attach_disk(struct device *dev, disk->fops = &pmem_fops; disk->queue = q; disk->flags = GENHD_FL_EXT_DEVT; + disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; nvdimm_namespace_disk_name(ndns, disk->disk_name); set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) / 512); diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 9e538a59f09d..03e55bca4ada 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0) } for (npages = 1; npages < max_pages; npages++) { - page = page_cache_alloc_cold(inode->i_mapping); + page = page_cache_alloc(inode->i_mapping); if (!page) break; page_pool[npages] = page; diff --git a/fs/afs/write.c b/fs/afs/write.c index 106e43db1115..11dd0526b96b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -308,7 +308,7 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, _enter("{%x:%u},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("kill %lx-%lx", first, last); @@ -497,20 +497,13 @@ static int afs_writepages_region(struct address_space *mapping, _enter(",,%lx,%lx,", index, end); do { - n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, - 1, &page); + n = find_get_pages_range_tag(mapping, &index, end, + PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; _debug("wback %lx", page->index); - if (page->index > end) { - *_next = index; - put_page(page); - _leave(" = 0 [%lx]", *_next); - return 0; - } - /* at this point we hold neither mapping->tree_lock nor lock on * the page itself: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled back from @@ -609,7 +602,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) ASSERT(wb != NULL); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("done %lx-%lx", first, last); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index adbbc017191c..16045ea86fc1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3797,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping, int scanned = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3814,8 +3814,8 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag))) { unsigned i; scanned = 1; @@ -3825,11 +3825,6 @@ retry: if (!PagePrivate(page)) continue; - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -3941,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3961,8 +3956,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, + &index, end, tag))) { unsigned i; scanned = 1; @@ -3987,12 +3982,6 @@ retry: continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) flush_fn(data); diff --git a/fs/buffer.c b/fs/buffer.c index 1c18a22a6013..0736a6a2e2f0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1592,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { count = pagevec_count(&pvec); for (i = 0; i < count; i++) { @@ -3514,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, if (length <= 0) return -ENOENT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); do { unsigned nr_pages, i; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 18d7aa61ef0f..883bc7bb12c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem_monitor; } @@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem; } @@ -710,7 +708,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; op->op.flags |= FSCACHE_OP_ASYNC; @@ -844,7 +842,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op, ret = cachefiles_has_space(cache, 0, *nr_pages); if (ret == 0) { - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); list_for_each_entry(page, pages, lru) { if (pagevec_add(&pagevec, page) == 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4d622654bfbc..dbf07051aacd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -680,7 +680,7 @@ static void ceph_release_pages(struct page **pages, int num) struct pagevec pvec; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); for (i = 0; i < num; i++) { if (pagevec_add(&pvec, pages[i]) == 0) pagevec_release(&pvec); @@ -811,7 +811,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; @@ -870,15 +870,10 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = min_t(unsigned, PAGEVEC_SIZE, - max_pages - locked_pages); - if (end - index < (u64)(pvec_pages - 1)) - pvec_pages = (unsigned)(end - index) + 1; - - pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - pvec_pages); - dout("pagevec_lookup_tag got %d\n", pvec_pages); + pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY, + max_pages - locked_pages); + dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { @@ -896,16 +891,6 @@ get_more_pages: unlock_page(page); continue; } - if (page->index > end) { - dout("end of range %p\n", page); - /* can't be range_cyclic (1st pass) because - * end == -1 in that case. */ - stop = true; - if (ceph_wbc.head_snapc) - done = true; - unlock_page(page); - break; - } if (strip_unit_end && (page->index > strip_unit_end)) { dout("end of strip unit %p\n", page); unlock_page(page); @@ -1177,8 +1162,7 @@ release_pvec_pages: index = 0; while ((index <= end) && (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { for (i = 0; i < nr; i++) { page = pvec.pages[i]; if (page_snap_context(page) != snapc) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 92fdf9c35de2..df9f682708c6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1963,8 +1963,6 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, pgoff_t end, pgoff_t *index, unsigned int *found_pages) { - unsigned int nr_pages; - struct page **pages; struct cifs_writedata *wdata; wdata = cifs_writedata_alloc((unsigned int)tofind, @@ -1972,23 +1970,8 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, if (!wdata) return NULL; - /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_SIZE. - */ - *found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, index, - PAGECACHE_TAG_DIRTY, tofind, - pages); - *found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && *index <= end); - + *found_pages = find_get_pages_range_tag(mapping, index, end, + PAGECACHE_TAG_DIRTY, tofind, wdata->pages); return wdata; } @@ -565,7 +565,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, - new_entry, NULL, NULL); + new_entry, NULL); entry = new_entry; } @@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD pmd_t pmd; @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -643,7 +649,6 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } @@ -789,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, tag_pages_for_writeback(mapping, start_index, end_index); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, diff --git a/fs/dcache.c b/fs/dcache.c index bcc9f6981569..5c7df1df81ff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target) */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); - kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6b801186baa5..25aeaa7328ba 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -660,7 +660,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - unsigned long flags; + slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2633150e41b9..8d2b582fb141 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1719,7 +1719,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, ext4_es_remove_extent(inode, start, last - start + 1); } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end) { nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) @@ -2345,7 +2345,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) lblk = start << bpp_bits; pblock = mpd->map.m_pblk; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &start, end); @@ -2616,12 +2616,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) else tag = PAGECACHE_TAG_DIRTY; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) goto out; @@ -2629,16 +2629,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct page *page = pvec.pages[i]; /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) - goto out; - - /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 04fe1df052b2..0bb8e2c022d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -305,25 +305,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; + pgoff_t index = 0, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; + int nr_pages; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); blk_start_plug(&plug); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (unlikely(nr_pages == 0)) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 36b535207c88..7b3ad5d8e2e9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1635,7 +1635,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) @@ -1669,8 +1669,8 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; @@ -1678,11 +1678,6 @@ retry: struct page *page = pvec.pages[i]; bool submitted = false; - if (page->index > end) { - done = 1; - break; - } - done_index = page->index; retry_write: lock_page(page); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 517e112c8a9a..f78b76ec4707 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -313,18 +313,19 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { - struct pagevec pvec; + struct page *page; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ - pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, - PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; - pagevec_release(&pvec); + nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!nr_pages) + return ULONG_MAX; + pgofs = page->index; + put_page(page); return pgofs; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fca87835a1da..b33dac9592ca 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1277,21 +1277,17 @@ release_page: static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; struct page *last_page = NULL; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1425,13 +1421,14 @@ static int f2fs_write_node_page(struct page *page, int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { - pgoff_t index, end; + pgoff_t index; pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nr_pages; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1439,17 +1436,12 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1548,25 +1540,21 @@ out: int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; int step = 0; int nwritten = 0; int ret = 0; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next_step: index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1655,27 +1643,20 @@ out: int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = ULONG_MAX; + pgoff_t index = 0; struct pagevec pvec; int ret2, ret = 0; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (unlikely(page->index > end)) - continue; - if (ino && ino_of_node(page) == ino) { f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) diff --git a/fs/file_table.c b/fs/file_table.c index 49e1f2f1a4cb..2dc9f38bd195 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -312,7 +312,7 @@ void put_filp(struct file *file) void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 0ad3fd3ad0b4..961029e04027 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1175,7 +1175,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, return; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = 0; do { if (!pagevec_lookup(&pvec, mapping, &next)) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a42d89371748..17f0d05bfd4c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1636,7 +1636,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, false); + release_pages(req->pages, req->num_pages); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a79e320349cd..2f504d615d92 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1273,9 +1273,9 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, - fuse_inode_init_once); + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT, + fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) goto out; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 68ed06962537..1daf15a1f00c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -280,22 +280,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - ret = 1; - break; - } - *done_index = page->index; lock_page(page); @@ -387,7 +371,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -413,8 +397,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ed113ea17aff..1e76730aac0d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -407,7 +407,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = start; while (next < end) { /* @@ -668,7 +668,6 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - error = -EINVAL; if (attr->ia_size & ~huge_page_mask(h)) return -EINVAL; error = hugetlb_vmtruncate(inode, attr->ia_size); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 06ffa135dfa6..16a7a67a11c9 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2156,10 +2156,10 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, btcache, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 8616c46d33da..68241512d7c1 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -255,10 +255,9 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pgoff_t index = 0; int err = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -310,7 +309,7 @@ void nilfs_copy_back_pages(struct address_space *dmap, pgoff_t index = 0; int err; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: n = pagevec_lookup(&pvec, smap, &index); if (!n) @@ -374,10 +373,10 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -519,7 +518,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 70ded52dc1dd..f65392fecb5c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -708,21 +708,17 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: if (unlikely(index > last) || - !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - min_t(pgoff_t, last - index, - PAGEVEC_SIZE - 1) + 1)) + !pagevec_lookup_range_tag(&pvec, mapping, &index, last, + PAGECACHE_TAG_DIRTY)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { struct buffer_head *bh, *head; struct page *page = pvec.pages[i]; - if (unlikely(page->index > last)) - break; - lock_page(page); if (!page_has_buffers(page)) create_empty_buffers(page, i_blocksize(inode), 0); @@ -757,10 +753,10 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index addd7c5f2d3e..ab5105f9767e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3585,8 +3585,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, * The easy case - we can just plop the record right in. */ *left_rec = *split_rec; - - has_empty_extent = 0; } else le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 88a31e9340a0..d1516327b787 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -134,6 +134,19 @@ bail: return err; } +static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + down_read(&oi->ip_alloc_sem); + ret = ocfs2_get_block(inode, iblock, bh_result, create); + up_read(&oi->ip_alloc_sem); + + return ret; +} + int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -2128,7 +2141,7 @@ static void ocfs2_dio_free_write_ctx(struct inode *inode, * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); */ -static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, +static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2154,12 +2167,9 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, * while file size will be changed. */ if (pos + total_len <= i_size_read(inode)) { - down_read(&oi->ip_alloc_sem); - /* This is the fast path for re-write. */ - ret = ocfs2_get_block(inode, iblock, bh_result, create); - - up_read(&oi->ip_alloc_sem); + /* This is the fast path for re-write. */ + ret = ocfs2_lock_get_block(inode, iblock, bh_result, create); if (buffer_mapped(bh_result) && !buffer_new(bh_result) && ret == 0) @@ -2424,9 +2434,9 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; if (iov_iter_rw(iter) == READ) - get_block = ocfs2_get_block; + get_block = ocfs2_lock_get_block; else - get_block = ocfs2_dio_get_block; + get_block = ocfs2_dio_wr_get_block; return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index b97bcc6dde7c..b1bb70c8ca4d 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -28,9 +28,6 @@ #include <linux/buffer_head.h> -void ocfs2_end_buffer_io_sync(struct buffer_head *bh, - int uptodate); - int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct ocfs2_caching_info *ci); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 3ef5137dc362..a9e67efc0004 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -79,10 +79,8 @@ void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_exit(void); int o2hb_init(void); -int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); -int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); int o2hb_get_all_regions(char *region_uuids, u8 numregions); int o2hb_global_heartbeat_active(void); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index a51200ece93d..da64c3a20eeb 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -40,6 +40,9 @@ char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { "panic", /* O2NM_FENCE_PANIC */ }; +static inline void o2nm_lock_subsystem(void); +static inline void o2nm_unlock_subsystem(void); + struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -181,7 +184,10 @@ static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) { /* through the first node_set .parent * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ - return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + if (node->nd_item.ci_parent) + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + else + return NULL; } enum { @@ -194,7 +200,7 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; int ret = 0; @@ -214,6 +220,13 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) ret = -EEXIST; @@ -226,6 +239,8 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -269,7 +284,7 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; int ret, i; struct rb_node **p, *parent; unsigned int octets[4]; @@ -286,6 +301,13 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); } + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + ret = 0; write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) @@ -298,6 +320,8 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -315,7 +339,7 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; ssize_t ret; @@ -333,17 +357,26 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + ret = -EINVAL; + goto out; + } + /* the only failure case is trying to set a new local node * when a different one is already set */ if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; + cluster->cl_local_node != node->nd_num) { + ret = -EBUSY; + goto out; + } /* bring up the rx thread if we're setting the new local node. */ if (tmp && !cluster->cl_has_local) { ret = o2net_start_listening(node); if (ret) - return ret; + goto out; } if (!tmp && cluster->cl_has_local && @@ -358,7 +391,11 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, cluster->cl_local_node = node->nd_num; } - return count; + ret = count; + +out: + o2nm_unlock_subsystem(); + return ret; } CONFIGFS_ATTR(o2nm_node_, num); @@ -738,6 +775,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +static inline void o2nm_lock_subsystem(void) +{ + mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + +static inline void o2nm_unlock_subsystem(void) +{ + mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + int o2nm_depend_item(struct config_item *item) { return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index a2b19fbdcf46..e1fea149f50b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -394,7 +394,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) { if (dlm->dlm_worker) { - flush_workqueue(dlm->dlm_worker); destroy_workqueue(dlm->dlm_worker); dlm->dlm_worker = NULL; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..9c3e0f13ca87 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2616,7 +2616,9 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * otherwise the assert_master from the new * master will destroy this. */ - dlm_get_mle_inuse(mle); + if (ret != -EEXIST) + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..ec8f75813beb 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_lockres_put(res); continue; } + dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 988137de08f5..9c7c18c0e129 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -670,7 +670,6 @@ static void __exit exit_dlmfs_fs(void) { unregister_filesystem(&dlmfs_fs_type); - flush_workqueue(user_dlm_worker); destroy_workqueue(user_dlm_worker); /* diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6e41fc8fabbe..dc455d45a66a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1161,6 +1161,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + /* + * Here we should wait dio to finish before inode lock + * to avoid a deadlock between ocfs2_setattr() and + * ocfs2_dio_end_io_write() + */ + inode_dio_wait(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1200,8 +1207,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) goto bail_unlock; - inode_dio_wait(inode); - if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 71f22c8fbffd..9f0b95abc09f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1147,12 +1147,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); - if (status < 0 && status != -ENOSPC) { + if (status < 0 && status != -ENOSPC) mlog_errno(status); - goto bail; - } -bail: return status; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80733496b22a..040bbb6a6e4b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2521,10 +2521,8 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) /* This function assumes that the caller has the main osb resource */ /* ocfs2_initializer_super have already created this workqueue */ - if (osb->ocfs2_wq) { - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) destroy_workqueue(osb->ocfs2_wq); - } ocfs2_free_slot_info(osb); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b023e4f3d740..d4550c8bbc41 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -26,9 +26,6 @@ #ifndef OCFS2_SUPER_H #define OCFS2_SUPER_H -int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, - int node_num); - __printf(3, 4) int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6744bd706ecf..875231c36cb3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -50,8 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -67,7 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -80,8 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, + mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f46d133c0949..ac9a4e65ca49 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -668,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->features = octx->features; ctx->released = false; ctx->mm = vma->vm_mm; - atomic_inc(&ctx->mm->mm_count); + mmgrab(ctx->mm); userfaultfd_ctx_get(octx); fctx->orig = octx; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 758f37ac5ad3..4b87472f35bc 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name) } static inline kmem_zone_t * -kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, +kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags, void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f41ca8486e02..e54e7e0033eb 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -93,7 +93,7 @@ extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long wb_stat_error(struct bdi_writeback *wb) +static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; @@ -122,6 +122,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. * * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. + * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be + * inefficient. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -129,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 +#define BDI_CAP_SYNCHRONOUS_IO 0x00000040 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -174,6 +177,11 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; +} + static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { return bdi->capabilities & BDI_CAP_STABLE_WRITES; diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index fdf40ca04b3c..a53063e9d7d8 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -161,6 +161,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ +void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid); void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); @@ -177,6 +180,14 @@ static inline void * __init memblock_virt_alloc( NUMA_NO_NODE); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -258,6 +269,14 @@ static inline void * __init memblock_virt_alloc( return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -310,6 +329,14 @@ static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, min_addr); } +static inline void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, + min_addr, max_addr); +} + static inline void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) diff --git a/include/linux/c2port.h b/include/linux/c2port.h index 4efabcb51347..f2736348ca26 100644 --- a/include/linux/c2port.h +++ b/include/linux/c2port.h @@ -9,8 +9,6 @@ * the Free Software Foundation */ -#include <linux/kmemcheck.h> - #define C2PORT_NAME_LEN 32 struct device; @@ -22,10 +20,8 @@ struct device; /* Main struct */ struct c2port_ops; struct c2port_device { - kmemcheck_bitfield_begin(flags); unsigned int access:1; unsigned int flash_access:1; - kmemcheck_bitfield_end(flags); int id; char name[C2PORT_NAME_LEN]; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index eee1499db396..e8f8e8fb244d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -9,7 +9,6 @@ #include <linux/dma-debug.h> #include <linux/dma-direction.h> #include <linux/scatterlist.h> -#include <linux/kmemcheck.h> #include <linux/bug.h> #include <linux/mem_encrypt.h> @@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(ptr, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, @@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - int i, ents; - struct scatterlist *s; + int ents; - for_each_sg(sg, s, nents, i) - kmemcheck_mark_initialized(sg_virt(s), s->length); BUG_ON(!valid_dma_direction(dir)); ents = ops->map_sg(dev, sg, nents, dir, attrs); BUG_ON(ents < 0); @@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(page_address(page) + offset, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); diff --git a/include/linux/filter.h b/include/linux/filter.h index 0cd02ff4ae30..80b5b482cb46 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -454,13 +454,11 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated pages */ - kmemcheck_bitfield_begin(meta); u16 jited:1, /* Is our filter JIT'ed? */ locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ - kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 710143741eb5..1a4582b44d32 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,7 +24,6 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u -#define ___GFP_COLD 0x100u #define ___GFP_NOWARN 0x200u #define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u @@ -37,7 +36,6 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u #define ___GFP_ACCOUNT 0x100000u -#define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u @@ -193,27 +191,15 @@ struct vm_area_struct; /* * Action modifiers * - * __GFP_COLD indicates that the caller does not expect to be used in the near - * future. Where possible, a cache-cold page will be returned. - * * __GFP_NOWARN suppresses allocation failure reports. * * __GFP_COMP address compound page metadata. * * __GFP_ZERO returns a zeroed page on success. - * - * __GFP_NOTRACK avoids tracking with kmemcheck. - * - * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of - * distinguishing in the source between false positives and allocations that - * cannot be supported (e.g. page tables). */ -#define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) -#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) -#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) @@ -539,8 +525,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_hot_cold_page(struct page *page, bool cold); -extern void free_hot_cold_page_list(struct list_head *list, bool cold); +extern void free_unref_page(struct page *page); +extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 96e69979f84d..325017ad9311 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, * @page: pointer to struct page * Return: driver data value */ -static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; + const unsigned long *drvdata = (const unsigned long *)&page->pgmap; return drvdata[1]; } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index baeb872283d9..69c238210325 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t) __tasklet_hi_schedule(t); } -extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); - -/* - * This version avoids touching any other tasklets. Needed for kmemcheck - * in order not to take any page faults while enqueueing this tasklet; - * consider VERY carefully whether you really need this or - * tasklet_hi_schedule()... - */ -static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) - __tasklet_hi_schedule_first(t); -} - - static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5017269e3f04..e3eb834c9a35 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -46,7 +46,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags); + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -95,7 +95,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) {} + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 7b1d7bead7d9..ea32a7d3cf1b 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -1,172 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_KMEMCHECK_H -#define LINUX_KMEMCHECK_H - -#include <linux/mm_types.h> -#include <linux/types.h> - -#ifdef CONFIG_KMEMCHECK -extern int kmemcheck_enabled; - -/* The slab-related functions. */ -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); -void kmemcheck_free_shadow(struct page *page, int order); -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size); -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); - -void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, - gfp_t gfpflags); - -void kmemcheck_show_pages(struct page *p, unsigned int n); -void kmemcheck_hide_pages(struct page *p, unsigned int n); - -bool kmemcheck_page_is_tracked(struct page *p); - -void kmemcheck_mark_unallocated(void *address, unsigned int n); -void kmemcheck_mark_uninitialized(void *address, unsigned int n); -void kmemcheck_mark_initialized(void *address, unsigned int n); -void kmemcheck_mark_freed(void *address, unsigned int n); - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); - -int kmemcheck_show_addr(unsigned long address); -int kmemcheck_hide_addr(unsigned long address); - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); - -/* - * Bitfield annotations - * - * How to use: If you have a struct using bitfields, for example - * - * struct a { - * int x:8, y:8; - * }; - * - * then this should be rewritten as - * - * struct a { - * kmemcheck_bitfield_begin(flags); - * int x:8, y:8; - * kmemcheck_bitfield_end(flags); - * }; - * - * Now the "flags_begin" and "flags_end" members may be used to refer to the - * beginning and end, respectively, of the bitfield (and things like - * &x.flags_begin is allowed). As soon as the struct is allocated, the bit- - * fields should be annotated: - * - * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); - * kmemcheck_annotate_bitfield(a, flags); - */ -#define kmemcheck_bitfield_begin(name) \ - int name##_begin[0]; - -#define kmemcheck_bitfield_end(name) \ - int name##_end[0]; - -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - int _n; \ - \ - if (!ptr) \ - break; \ - \ - _n = (long) &((ptr)->name##_end) \ - - (long) &((ptr)->name##_begin); \ - BUILD_BUG_ON(_n < 0); \ - \ - kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - kmemcheck_mark_initialized(&(var), sizeof(var)); \ - } while (0) \ - -#else -#define kmemcheck_enabled 0 - -static inline void -kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ -} - -static inline void -kmemcheck_free_shadow(struct page *page, int order) -{ -} - -static inline void -kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ -} - -static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, - size_t size) -{ -} - -static inline void kmemcheck_pagealloc_alloc(struct page *p, - unsigned int order, gfp_t gfpflags) -{ -} - -static inline bool kmemcheck_page_is_tracked(struct page *p) -{ - return false; -} - -static inline void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_freed(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_unallocated_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - return true; -} - -#define kmemcheck_bitfield_begin(name) -#define kmemcheck_bitfield_end(name) -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - } while (0) - -#endif /* CONFIG_KMEMCHECK */ - -#endif /* LINUX_KMEMCHECK_H */ diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 590343f6c1b1..5ac416e2d339 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -48,14 +48,14 @@ extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_alloc(ptr, size, min_count, gfp); } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_free(ptr); @@ -76,7 +76,7 @@ static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count, { } static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { } @@ -94,7 +94,7 @@ static inline void kmemleak_free(const void *ptr) static inline void kmemleak_free_part(const void *ptr, size_t size) { } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { } static inline void kmemleak_free_percpu(const void __percpu *ptr) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index bae11c7e7bf3..7ed0f7782d16 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) +/** + * for_each_resv_unavail_range - iterate through reserved and unavailable memory + * @i: u64 used as loop variable + * @flags: pick from blocks based on memory attributes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over unavailable but reserved (reserved && !memory) areas of memblock. + * Available as soon as memblock is initialized. + * Note: because this memory does not belong to any physical node, flags and + * nid arguments do not make sense and thus not exported as arguments. + */ +#define for_each_resv_unavail_range(i, p_start, p_end) \ + for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ + NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) + static inline void memblock_set_region_flags(struct memblock_region *r, unsigned long flags) { @@ -389,10 +405,10 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) -#define for_each_memblock_type(memblock_type, rgn) \ - for (idx = 0, rgn = &memblock_type->regions[0]; \ - idx < memblock_type->cnt; \ - idx++, rgn = &memblock_type->regions[idx]) +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); diff --git a/include/linux/mm.h b/include/linux/mm.h index 91b46f99b4d2..c7b1d617dff6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -96,6 +96,15 @@ extern int mmap_rnd_compat_bits __read_mostly; #endif /* + * On some architectures it is expensive to call memset() for small sizes. + * Those architectures should provide their own implementation of "struct page" + * zeroing by defining this macro in <asm/pgtable.h>. + */ +#ifndef mm_zero_struct_page +#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) +#endif + +/* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. @@ -1431,7 +1440,13 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void cancel_dirty_page(struct page *page); +void __cancel_dirty_page(struct page *page); +static inline void cancel_dirty_page(struct page *page) +{ + /* Avoid atomic ops, locking, etc. when not actually needed. */ + if (PageDirty(page)) + __cancel_dirty_page(page); +} int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); @@ -1599,26 +1614,32 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PUD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); -#endif -#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) -static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, - unsigned long address) +static inline void mm_inc_nr_puds(struct mm_struct *mm) { - return 0; + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } -static inline void mm_nr_pmds_init(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); +} +#endif -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) +static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, + unsigned long address) { return 0; } @@ -1629,25 +1650,47 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); -static inline void mm_nr_pmds_init(struct mm_struct *mm) +static inline void mm_inc_nr_pmds(struct mm_struct *mm) { - atomic_long_set(&mm->nr_pmds, 0); + atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline void mm_dec_nr_pmds(struct mm_struct *mm) { - return atomic_long_read(&mm->nr_pmds); + atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } +#endif -static inline void mm_inc_nr_pmds(struct mm_struct *mm) +#ifdef CONFIG_MMU +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_pmds); + atomic_long_set(&mm->pgtables_bytes, 0); } -static inline void mm_dec_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->pgtables_bytes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_pmds); + atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } +#else + +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); @@ -2002,6 +2045,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif +#ifdef CONFIG_HAVE_MEMBLOCK +void zero_resv_unavail(void); +#else +static inline void zero_resv_unavail(void) {} +#endif + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c85f11dafd56..cfd0ac4e5e0e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -48,8 +48,10 @@ struct page { * inode address_space, or NULL. * If page mapped as anonymous * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. + * it points to anon_vma object + * or KSM private structure. See + * PAGE_MAPPING_ANON and + * PAGE_MAPPING_KSM. */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ @@ -207,14 +209,6 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_KMEMCHECK - /* - * kmemcheck wants to track the status of each byte in a page; this - * is a pointer to such a status block. NULL if not tracked. - */ - void *shadow; -#endif - #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif @@ -399,9 +393,8 @@ struct mm_struct { */ atomic_t mm_count; - atomic_long_t nr_ptes; /* PTE page table pages */ -#if CONFIG_PGTABLE_LEVELS > 2 - atomic_long_t nr_pmds; /* PMD page table pages */ +#ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2cf1c3c807f6..b25dc9db19fc 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -156,7 +156,8 @@ struct mmu_notifier_ops { * shared page-tables, it not necessary to implement the * invalidate_range_start()/end() notifiers, as * invalidate_range() alread catches the points in time when an - * external TLB range needs to be flushed. + * external TLB range needs to be flushed. For more in depth + * discussion on this see Documentation/vm/mmu_notifier.txt * * The invalidate_range() function is called under the ptl * spin-lock and not allowed to sleep. @@ -213,7 +214,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -267,7 +269,14 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_end(mm, start, end); + __mmu_notifier_invalidate_range_end(mm, start, end, false); +} + +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_range_end(mm, start, end, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -438,6 +447,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, { } +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a507f43ad221..67f2e3c38939 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -700,7 +700,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -712,12 +713,6 @@ typedef struct pglist_data { /* Fields commonly accessed by the page reclaim scanner */ struct lruvec lruvec; - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this node's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - unsigned long flags; ZONE_PADDING(_pad2_) diff --git a/include/linux/net.h b/include/linux/net.h index d97d80d7fdf8..caeb159abda5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -22,7 +22,6 @@ #include <linux/random.h> #include <linux/wait.h> #include <linux/fcntl.h> /* For O_CLOEXEC and O_NONBLOCK */ -#include <linux/kmemcheck.h> #include <linux/rcupdate.h> #include <linux/once.h> #include <linux/fs.h> @@ -111,9 +110,7 @@ struct socket_wq { struct socket { socket_state state; - kmemcheck_bitfield_begin(type); short type; - kmemcheck_bitfield_end(type); unsigned long flags; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index de1c50b93c61..15cab3967d6d 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -104,7 +104,9 @@ extern nodemask_t _unused_nodemask_arg_; * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ -#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits +#define nodemask_pr_args(maskp) \ + ((maskp) != NULL) ? MAX_NUMNODES : 0, \ + ((maskp) != NULL) ? (maskp)->bits : NULL /* * The inline keyword gives the compiler room to decide to inline, or diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 584b14c774c1..3ec44e27aa9d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -18,7 +18,7 @@ * Various page->flags bits: * * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist (eg empty_bad_page)... + * of them might not even exist... * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 05a04e603686..cdad58bbfd8b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ static inline bool is_migrate_isolate(int migratetype) #endif bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - bool skip_hwpoisoned_pages); + int migratetype, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e08b5339023c..34ce3ebf97d5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -16,6 +16,8 @@ #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> +struct pagevec; + /* * Bits in mapping->flags. */ @@ -116,7 +118,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } -void release_pages(struct page **pages, int nr, bool cold); +void release_pages(struct page **pages, int nr); /* * speculatively take a reference to a page. @@ -232,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x) return __page_cache_alloc(mapping_gfp_mask(x)); } -static inline struct page *page_cache_alloc_cold(struct address_space *x) -{ - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); -} - static inline gfp_t readahead_gfp_mask(struct address_space *x) { - return mapping_gfp_mask(x) | - __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN; + return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); @@ -366,8 +362,16 @@ static inline unsigned find_get_pages(struct address_space *mapping, } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages_tag(struct address_space *mapping, + pgoff_t *index, int tag, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, + nr_pages, pages); +} unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, int tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); @@ -616,6 +620,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2636c0c0f279..5fb6580f7f23 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,7 @@ struct address_space; struct pagevec { unsigned long nr; - unsigned long cold; + bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -38,14 +38,22 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); } -unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages); +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages); +static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, int tag) +{ + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); +} -static inline void pagevec_init(struct pagevec *pvec, int cold) +static inline void pagevec_init(struct pagevec *pvec) { pvec->nr = 0; - pvec->cold = cold; + pvec->percpu_pvec_drained = false; } static inline void pagevec_reinit(struct pagevec *pvec) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 567ebb5eaab0..0ca448c1cb42 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -301,18 +301,17 @@ void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); -typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, void __rcu **slot, void *entry, - radix_tree_update_node_t update_node, void *private); + radix_tree_update_node_t update_node); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); void __radix_tree_delete_node(struct radix_tree_root *, struct radix_tree_node *, - radix_tree_update_node_t update_node, - void *private); + radix_tree_update_node_t update_node); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fa6ace66fea5..289e4d54e3e0 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -2,7 +2,6 @@ #ifndef _LINUX_RING_BUFFER_H #define _LINUX_RING_BUFFER_H -#include <linux/kmemcheck.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/poll.h> @@ -14,9 +13,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - kmemcheck_bitfield_begin(bitfield); u32 type_len:5, time_delta:27; - kmemcheck_bitfield_end(bitfield); u32 array[]; }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 54fe91183a8e..ed06e1c28fc7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -15,7 +15,6 @@ #define _LINUX_SKBUFF_H #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> @@ -711,7 +710,6 @@ struct sk_buff { /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ - kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ @@ -730,7 +728,6 @@ struct sk_buff { head_frag:1, xmit_more:1, __unused:1; /* one bit hole */ - kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -2664,7 +2661,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ - gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC; + gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } diff --git a/include/linux/slab.h b/include/linux/slab.h index af5aa65c7c18..50697a1d6621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -21,13 +21,20 @@ * Flags to pass to kmem_cache_create(). * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ -#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ -#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ -#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ -#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ -#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ +/* DEBUG: Perform (expensive) checks on alloc/free */ +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) +/* DEBUG: Red zone objs in a cache */ +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) +/* DEBUG: Poison objects */ +#define SLAB_POISON ((slab_flags_t __force)0x00000800U) +/* Align objs on cache lines */ +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) +/* Use GFP_DMA memory */ +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U) +/* DEBUG: Store the last owner for bug hunting */ +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U) +/* Panic if kmem_cache_create() fails */ +#define SLAB_PANIC ((slab_flags_t __force)0x00040000U) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -65,44 +72,45 @@ * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ -#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ -#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ +/* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) +/* Spread some memory over cpuset */ +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U) +/* Trace allocations and frees */ +#define SLAB_TRACE ((slab_flags_t __force)0x00200000U) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS 0x00400000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U) #else -# define SLAB_DEBUG_OBJECTS 0x00000000UL +# define SLAB_DEBUG_OBJECTS 0 #endif -#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Avoid kmemleak tracing */ +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) -/* Don't track use of uninitialized memory */ -#ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK 0x01000000UL -#else -# define SLAB_NOTRACK 0x00000000UL -#endif +/* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) #else -# define SLAB_FAILSLAB 0x00000000UL +# define SLAB_FAILSLAB 0 #endif +/* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else -# define SLAB_ACCOUNT 0x00000000UL +# define SLAB_ACCOUNT 0 #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN 0x08000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else -#define SLAB_KASAN 0x00000000UL +#define SLAB_KASAN 0 #endif /* The following flags affect the page allocator grouping pages by mobility */ -#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ +/* Objects are reclaimable */ +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. @@ -128,7 +136,7 @@ void __init kmem_cache_init(void); bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, - unsigned long, + slab_flags_t, void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -459,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * - * %__GFP_COLD - Request cache-cold pages instead of - * trying to return cache-warm pages. - * * %__GFP_HIGH - This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail @@ -636,6 +641,22 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) +static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) + return kmalloc_node(n * size, flags, node); + return __kmalloc_node(n * size, flags, node); +} + +static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +{ + return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); +} + + #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8f7d2b1656d2..072e46e9e1d5 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -20,7 +20,7 @@ struct kmem_cache { struct reciprocal_value reciprocal_buffer_size; /* 2) touched by every alloc & free from the backend */ - unsigned int flags; /* constant flags */ + slab_flags_t flags; /* constant flags */ unsigned int num; /* # of objs per slab */ /* 3) cache_grow/shrink */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 39fa09bcde23..0adae162dc8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -82,7 +82,7 @@ struct kmem_cache_order_objects { struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ - unsigned long flags; + slab_flags_t flags; unsigned long min_partial; int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ diff --git a/include/linux/swap.h b/include/linux/swap.h index f02fb5db8914..c2b8128799c1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -171,8 +171,9 @@ enum { SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -297,7 +298,18 @@ struct vma_swap_readahead { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -void workingset_update_node(struct radix_tree_node *node, void *private); + +/* Do not use directly, use workingset_lookup_update */ +void workingset_update_node(struct radix_tree_node *node); + +/* Returns workingset_update_node() if the mapping has shadow entries. */ +#define workingset_lookup_update(mapping) \ +({ \ + radix_tree_update_node_t __helper = workingset_update_node; \ + if (dax_mapping(mapping) || shmem_mapping(mapping)) \ + __helper = NULL; \ + __helper; \ +}) /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; @@ -462,9 +474,11 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); +extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -473,6 +487,16 @@ extern void exit_swap_address_space(unsigned int type); #else /* CONFIG_SWAP */ +static inline int swap_readpage(struct page *page, bool do_poll) +{ + return 0; +} + +static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return NULL; +} + #define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L @@ -486,7 +510,7 @@ extern void exit_swap_address_space(unsigned int type); #define free_page_and_swap_cache(page) \ put_page(page) #define free_pages_and_swap_cache(pages, nr) \ - release_pages((pages), (nr), false); + release_pages((pages), (nr)); static inline void show_swap_cache_info(void) { @@ -577,6 +601,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + return 0; +} + static inline int __swp_swapcount(swp_entry_t entry) { return 0; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 4bcdf00c110f..34f053a150a9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -44,10 +44,9 @@ enum { #endif #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ - __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT) #endif /* diff --git a/include/linux/types.h b/include/linux/types.h index 34fce54e4f1b..c94d59ef96cc 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,6 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; +typedef unsigned __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1e0cb72e0598..1779c9817b39 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -7,9 +7,19 @@ #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> +#include <linux/static_key.h> extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 2135c9ba6ac3..39efb968b7a4 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -17,7 +17,6 @@ #define _INET_SOCK_H #include <linux/bitops.h> -#include <linux/kmemcheck.h> #include <linux/string.h> #include <linux/types.h> #include <linux/jhash.h> @@ -84,7 +83,6 @@ struct inet_request_sock { #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family - kmemcheck_bitfield_begin(flags); u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, @@ -94,7 +92,6 @@ struct inet_request_sock { acked : 1, no_srccheck: 1, smc_ok : 1; - kmemcheck_bitfield_end(flags); u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 6a75d67a30fd..1356fa6a7566 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -15,8 +15,6 @@ #ifndef _INET_TIMEWAIT_SOCK_ #define _INET_TIMEWAIT_SOCK_ - -#include <linux/kmemcheck.h> #include <linux/list.h> #include <linux/timer.h> #include <linux/types.h> @@ -69,14 +67,12 @@ struct inet_timewait_sock { /* Socket demultiplex comparisons on incoming packets. */ /* these three are in inet_sock */ __be16 tw_sport; - kmemcheck_bitfield_begin(flags); /* And these are ours. */ unsigned int tw_kill : 1, tw_transparent : 1, tw_flowlabel : 20, tw_pad : 2, /* 2 bits hole */ tw_tos : 8; - kmemcheck_bitfield_end(flags); struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/include/net/sock.h b/include/net/sock.h index f8715c5af37d..79e1a2c7912c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -440,7 +440,6 @@ struct sock { #define SK_FL_TYPE_MASK 0xffff0000 #endif - kmemcheck_bitfield_begin(flags); unsigned int sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, @@ -449,8 +448,6 @@ struct sock { sk_protocol : 8, sk_type : 16; #define SK_PROTOCOL_MAX U8_MAX - kmemcheck_bitfield_end(flags); - u16 sk_gso_max_segs; u8 sk_pacing_shift; unsigned long sk_lingertime; @@ -1114,7 +1111,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; - int slab_flags; + slab_flags_t slab_flags; struct percpu_counter *orphan_count; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 285feeadac39..eb57e3037deb 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -172,24 +172,21 @@ TRACE_EVENT(mm_page_free, TRACE_EVENT(mm_page_free_batched, - TP_PROTO(struct page *page, int cold), + TP_PROTO(struct page *page), - TP_ARGS(page, cold), + TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) - __field( int, cold ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->cold = cold; ), - TP_printk("page=%p pfn=%lu order=0 cold=%d", + TP_printk("page=%p pfn=%lu order=0", pfn_to_page(__entry->pfn), - __entry->pfn, - __entry->cold) + __entry->pfn) ); TRACE_EVENT(mm_page_alloc, diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 648cbf603736..dbe1bb058c09 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -32,7 +32,6 @@ {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "__GFP_IO"}, \ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ - {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ @@ -46,7 +45,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ - {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ diff --git a/init/Kconfig b/init/Kconfig index 5327146db9b5..7d5a6fbac56a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1655,12 +1655,6 @@ config HAVE_GENERIC_DMA_COHERENT bool default n -config SLABINFO - bool - depends on PROC_FS - depends on SLAB || SLUB_DEBUG - default y - config RT_MUTEXES bool diff --git a/init/do_mounts.c b/init/do_mounts.c index f6d4dd764a52..7cf4f6dafd5f 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) void __init mount_block_root(char *name, int flags) { - struct page *page = alloc_page(GFP_KERNEL | - __GFP_NOTRACK_FALSE_POSITIVE); + struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; #ifdef CONFIG_BLOCK diff --git a/init/main.c b/init/main.c index 3bdd8da90f69..859a786f7c0a 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,6 @@ #include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/async.h> -#include <linux/kmemcheck.h> #include <linux/sfi.h> #include <linux/shmem_fs.h> #include <linux/slab.h> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a6c37762330..b9f8686a84cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - kmemcheck_annotate_bitfield(fp, meta); - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { - kmemcheck_annotate_bitfield(fp, meta); - memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; @@ -675,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); if (fp != NULL) { - kmemcheck_annotate_bitfield(fp, meta); - /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..4e55eedba8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -469,7 +469,7 @@ void __init fork_init(void) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -817,8 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); - mm_nr_pmds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -872,12 +871,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); @@ -2209,18 +2205,18 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); + SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the @@ -2231,7 +2227,7 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index db933d063bfc..9776da8db180 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -47,7 +47,6 @@ #include <linux/stringify.h> #include <linux/bitops.h> #include <linux/gfp.h> -#include <linux/kmemcheck.h> #include <linux/random.h> #include <linux/jhash.h> @@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, { int i; - kmemcheck_mark_initialized(lock, sizeof(*lock)); - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) lock->class_cache[i] = NULL; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a917a301e201..bce0464524d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) */ static inline int get_highmem_buffer(int safe_needed) { - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + buffer = get_image_page(GFP_ATOMIC, safe_needed); return buffer ? 0 : -ENOMEM; } @@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm, while (nr_pages-- > 0) { struct page *page; - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_image_page(GFP_ATOMIC); if (!page) goto err_out; memory_bm_set_bit(copy_bm, page_to_pfn(page)); diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..aa1fb9f905db 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); + q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 662f7b1b7a78..2f5e87f1bae2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); -void __tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - lockdep_assert_irqs_disabled(); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); -} -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9576bd582d4a..4a13a389e99b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,7 +30,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/kmemcheck.h> #include <linux/kmemleak.h> #include <linux/fs.h> #include <linux/init.h> @@ -1174,15 +1173,6 @@ static struct ctl_table kern_table[] = { .extra2 = &one_thousand, }, #endif -#ifdef CONFIG_KMEMCHECK - { - .procname = "kmemcheck", - .data = &kmemcheck_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "panic_on_warn", .data = &panic_on_warn, @@ -1366,6 +1356,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "hugetlb_shm_group", diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 845f3805c73d..d57fede84b38 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -13,7 +13,6 @@ #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kthread.h> /* for self test */ -#include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> @@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); /* account for padding bytes */ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); @@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 07ce7449765a..5402e3954659 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -504,7 +504,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT config DEBUG_SLAB bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB && !KMEMCHECK + depends on DEBUG_KERNEL && SLAB help Say Y here to have the kernel do limited verification on memory allocation as well as poisoning memory on free to catch use of freed @@ -516,7 +516,7 @@ config DEBUG_SLAB_LEAK config SLUB_DEBUG_ON bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG && !KMEMCHECK + depends on SLUB && SLUB_DEBUG default n help Boot with debugging on by default. SLUB boots by default with @@ -730,8 +730,6 @@ config DEBUG_STACKOVERFLOW If in doubt, say "N". -source "lib/Kconfig.kmemcheck" - source "lib/Kconfig.kasan" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck deleted file mode 100644 index 846e039a86b4..000000000000 --- a/lib/Kconfig.kmemcheck +++ /dev/null @@ -1,94 +0,0 @@ -config HAVE_ARCH_KMEMCHECK - bool - -if HAVE_ARCH_KMEMCHECK - -menuconfig KMEMCHECK - bool "kmemcheck: trap use of uninitialized memory" - depends on DEBUG_KERNEL - depends on !X86_USE_3DNOW - depends on SLUB || SLAB - depends on !CC_OPTIMIZE_FOR_SIZE - depends on !FUNCTION_TRACER - select FRAME_POINTER - select STACKTRACE - default n - help - This option enables tracing of dynamically allocated kernel memory - to see if memory is used before it has been given an initial value. - Be aware that this requires half of your memory for bookkeeping and - will insert extra code at *every* read and write to tracked memory - thus slow down the kernel code (but user code is unaffected). - - The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable - or enable kmemcheck at boot-time. If the kernel is started with - kmemcheck=0, the large memory and CPU overhead is not incurred. - -choice - prompt "kmemcheck: default mode at boot" - depends on KMEMCHECK - default KMEMCHECK_ONESHOT_BY_DEFAULT - help - This option controls the default behaviour of kmemcheck when the - kernel boots and no kmemcheck= parameter is given. - -config KMEMCHECK_DISABLED_BY_DEFAULT - bool "disabled" - depends on KMEMCHECK - -config KMEMCHECK_ENABLED_BY_DEFAULT - bool "enabled" - depends on KMEMCHECK - -config KMEMCHECK_ONESHOT_BY_DEFAULT - bool "one-shot" - depends on KMEMCHECK - help - In one-shot mode, only the first error detected is reported before - kmemcheck is disabled. - -endchoice - -config KMEMCHECK_QUEUE_SIZE - int "kmemcheck: error queue size" - depends on KMEMCHECK - default 64 - help - Select the maximum number of errors to store in the queue. Since - errors can occur virtually anywhere and in any context, we need a - temporary storage area which is guarantueed not to generate any - other faults. The queue will be emptied as soon as a tasklet may - be scheduled. If the queue is full, new error reports will be - lost. - -config KMEMCHECK_SHADOW_COPY_SHIFT - int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)" - depends on KMEMCHECK - range 2 8 - default 5 - help - Select the number of shadow bytes to save along with each entry of - the queue. These bytes indicate what parts of an allocation are - initialized, uninitialized, etc. and will be displayed when an - error is detected to help the debugging of a particular problem. - -config KMEMCHECK_PARTIAL_OK - bool "kmemcheck: allow partially uninitialized memory" - depends on KMEMCHECK - default y - help - This option works around certain GCC optimizations that produce - 32-bit reads from 16-bit variables where the upper 16 bits are - thrown away afterwards. This may of course also hide some real - bugs. - -config KMEMCHECK_BITOPS_OK - bool "kmemcheck: allow bit-field manipulation" - depends on KMEMCHECK - default n - help - This option silences warnings that would be generated for bit-field - accesses where not all the bits are initialized at the same time. - This may also hide some real bugs. - -endif diff --git a/lib/idr.c b/lib/idr.c index edd9b2be1651..2593ce513a18 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -171,7 +171,7 @@ void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id) if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); - __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL, NULL); + __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL); return entry; } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8b1feca1230a..c8d55565fafa 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -677,8 +677,7 @@ out: * @root radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root, - radix_tree_update_node_t update_node, - void *private) + radix_tree_update_node_t update_node) { bool shrunk = false; @@ -739,7 +738,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; if (update_node) - update_node(node, private); + update_node(node); } WARN_ON_ONCE(!list_empty(&node->private_list)); @@ -752,7 +751,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node, - radix_tree_update_node_t update_node, void *private) + radix_tree_update_node_t update_node) { bool deleted = false; @@ -762,8 +761,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->rnode)) - deleted |= radix_tree_shrink(root, update_node, - private); + deleted |= radix_tree_shrink(root, + update_node); return deleted; } @@ -1173,7 +1172,6 @@ static int calculate_count(struct radix_tree_root *root, * @slot: pointer to slot in @node * @item: new item to store in the slot. * @update_node: callback for changing leaf nodes - * @private: private data to pass to @update_node * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. @@ -1181,7 +1179,7 @@ static int calculate_count(struct radix_tree_root *root, void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, - radix_tree_update_node_t update_node, void *private) + radix_tree_update_node_t update_node) { void *old = rcu_dereference_raw(*slot); int exceptional = !!radix_tree_exceptional_entry(item) - @@ -1201,9 +1199,9 @@ void __radix_tree_replace(struct radix_tree_root *root, return; if (update_node) - update_node(node, private); + update_node(node); - delete_node(root, node, update_node, private); + delete_node(root, node, update_node); } /** @@ -1225,7 +1223,7 @@ void __radix_tree_replace(struct radix_tree_root *root, void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { - __radix_tree_replace(root, NULL, slot, item, NULL, NULL); + __radix_tree_replace(root, NULL, slot, item, NULL); } EXPORT_SYMBOL(radix_tree_replace_slot); @@ -1242,7 +1240,7 @@ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { - __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); + __radix_tree_replace(root, iter->node, slot, item, NULL); } #ifdef CONFIG_RADIX_TREE_MULTIORDER @@ -1972,7 +1970,6 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); * @root: radix tree root * @node: node containing @index * @update_node: callback for changing leaf nodes - * @private: private data to pass to @update_node * * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the @@ -1980,10 +1977,9 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); */ void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node, - radix_tree_update_node_t update_node, - void *private) + radix_tree_update_node_t update_node) { - delete_node(root, node, update_node, private); + delete_node(root, node, update_node); } static bool __radix_tree_delete(struct radix_tree_root *root, @@ -2001,7 +1997,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, exceptional); - return node && delete_node(root, node, NULL, NULL); + return node && delete_node(root, node, NULL); } /** diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5b0adf1435de..e5e606ee5f71 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC - depends on !KMEMCHECK select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- diff --git a/mm/Makefile b/mm/Makefile index 4659b93cba43..e7ebd176fb93 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n KCOV_INSTRUMENT_debug-pagealloc.o := n KCOV_INSTRUMENT_kmemleak.o := n -KCOV_INSTRUMENT_kmemcheck.o := n KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n @@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o @@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); if (ret && !(gfp_mask & __GFP_NOWARN)) { - pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", + pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); } diff --git a/mm/debug.c b/mm/debug.c index 6726bec731c9..d947f3e03b0d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,7 +105,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - atomic_long_read((atomic_long_t *)&mm->nr_ptes), - mm_nr_pmds((struct mm_struct *)mm), + mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/filemap.c b/mm/filemap.c index 594d73fef8b4..923fc2ebd74a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,7 @@ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> +#include <linux/shmem_fs.h> #include <linux/rmap.h> #include "internal.h" @@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping, *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); mapping->nrpages++; return 0; } @@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping, radix_tree_clear_tags(&mapping->page_tree, node, slot); __radix_tree_replace(&mapping->page_tree, node, slot, shadow, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); } + page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ + if (shadow) { mapping->nrexceptional += nr; /* @@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping, mapping->nrpages -= nr; } -/* - * Delete a page from the page cache and free it. Caller has to make - * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. - */ -void __delete_from_page_cache(struct page *page, void *shadow) +static void unaccount_page_cache_page(struct address_space *mapping, + struct page *page) { - struct address_space *mapping = page->mapping; - int nr = hpage_nr_pages(page); + int nr; - trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow) } } - page_cache_tree_delete(mapping, page, shadow); - - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ - /* hugetlb pages do not participate in page cache accounting. */ if (PageHuge(page)) return; + nr = hpage_nr_pages(page); + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); @@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow) } /* - * At this point page must be either written or cleaned by truncate. - * Dirty page here signals a bug and loss of unwritten data. + * At this point page must be either written or cleaned by + * truncate. Dirty page here signals a bug and loss of + * unwritten data. * - * This fixes dirty accounting after removing the page entirely but - * leaves PageDirty set: it has no effect for truncated page and - * anyway will be cleared before returning page into buddy allocator. + * This fixes dirty accounting after removing the page entirely + * but leaves PageDirty set: it has no effect for truncated + * page and anyway will be cleared before returning page into + * buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } +/* + * Delete a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold the mapping's tree_lock. + */ +void __delete_from_page_cache(struct page *page, void *shadow) +{ + struct address_space *mapping = page->mapping; + + trace_mm_filemap_delete_from_page_cache(page); + + unaccount_page_cache_page(mapping, page); + page_cache_tree_delete(mapping, page, shadow); +} + +static void page_cache_free_page(struct address_space *mapping, + struct page *page) +{ + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + if (freepage) + freepage(page); + + if (PageTransHuge(page) && !PageHuge(page)) { + page_ref_sub(page, HPAGE_PMD_NR); + VM_BUG_ON_PAGE(page_count(page) <= 0, page); + } else { + put_page(page); + } +} + /** * delete_from_page_cache - delete page from page cache * @page: the page which the kernel is trying to remove from page cache @@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; - void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); - - freepage = mapping->a_ops->freepage; - spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - if (freepage) - freepage(page); + page_cache_free_page(mapping, page); +} +EXPORT_SYMBOL(delete_from_page_cache); - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, HPAGE_PMD_NR); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); - } else { - put_page(page); +/* + * page_cache_tree_delete_batch - delete several pages from page cache + * @mapping: the mapping to which pages belong + * @pvec: pagevec with pages to delete + * + * The function walks over mapping->page_tree and removes pages passed in @pvec + * from the radix tree. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (radix tree entries at those indices are not + * modified). The function expects only THP head pages to be present in the + * @pvec and takes care to delete all corresponding tail pages from the radix + * tree as well. + * + * The function expects mapping->tree_lock to be held. + */ +static void +page_cache_tree_delete_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + struct radix_tree_iter iter; + void **slot; + int total_pages = 0; + int i = 0, tail_pages = 0; + struct page *page; + pgoff_t start; + + start = pvec->pages[0]->index; + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (i >= pagevec_count(pvec) && !tail_pages) + break; + page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (radix_tree_exceptional_entry(page)) + continue; + if (!tail_pages) { + /* + * Some page got inserted in our range? Skip it. We + * have our pages locked so they are protected from + * being removed. + */ + if (page != pvec->pages[i]) + continue; + WARN_ON_ONCE(!PageLocked(page)); + if (PageTransHuge(page) && !PageHuge(page)) + tail_pages = HPAGE_PMD_NR - 1; + page->mapping = NULL; + /* + * Leave page->index set: truncation lookup relies + * upon it + */ + i++; + } else { + tail_pages--; + } + radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); + __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + workingset_lookup_update(mapping)); + total_pages++; } + mapping->nrpages -= total_pages; +} + +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + int i; + unsigned long flags; + + if (!pagevec_count(pvec)) + return; + + spin_lock_irqsave(&mapping->tree_lock, flags); + for (i = 0; i < pagevec_count(pvec); i++) { + trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + + unaccount_page_cache_page(mapping, pvec->pages[i]); + } + page_cache_tree_delete_batch(mapping, pvec); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + for (i = 0; i < pagevec_count(pvec); i++) + page_cache_free_page(mapping, pvec->pages[i]); } -EXPORT_SYMBOL(delete_from_page_cache); int filemap_check_errors(struct address_space *mapping) { @@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping, if (end_byte < start_byte) return; - pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + pagevec_init(&pvec); + while (index <= end) { unsigned i; + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_WRITEBACK); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - wait_on_page_writeback(page); ClearPageError(page); } @@ -1754,9 +1852,10 @@ repeat: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1764,8 +1863,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. */ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1778,6 +1878,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1819,18 +1922,28 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /** * find_get_entries_tag - find and return entries that match @tag @@ -2159,7 +2272,7 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc(mapping); if (!page) { error = -ENOMEM; goto out; @@ -2271,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) int ret; do { - page = __page_cache_alloc(gfp_mask|__GFP_COLD); + page = __page_cache_alloc(gfp_mask); if (!page) return -ENOMEM; @@ -2675,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping, repeat: page = find_get_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp | __GFP_COLD); + page = __page_cache_alloc(gfp); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); @@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); static void hmm_devmem_radix_release(struct resource *resource) { - resource_size_t key, align_start, align_size, align_end; + resource_size_t key, align_start, align_size; align_start = resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); - align_end = align_start + align_size - 1; mutex_lock(&hmm_devmem_lock); for (key = resource->start; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 003f7bcd0952..86fe697e8bfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); return true; } @@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); } set_pmd_at(mm, addr, pmd, entry); @@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; @@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); @@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); + /* + * Leave pmd empty until pte is filled note we must notify here as + * concurrent CPU thread might write to new page before the call to + * mmu_notifier_invalidate_range_end() happens which can lead to a + * device seeing memory write in different order than CPU. + * + * See Documentation/vm/mmu_notifier.txt + */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); @@ -1216,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, page_remove_rmap(page, true); spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1365,7 +1377,12 @@ alloc: } spin_unlock(vmf->ptl); out_mn: - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); out: return ret; out_unlock: @@ -1678,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -2017,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pudp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PUD_SIZE); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2029,8 +2051,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - /* leave pmd empty until pte is filled */ - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * Leave pmd empty until pte is filled note that it is fine to delay + * notification until mmu_notifier_invalidate_range_end() as we are + * replacing a zero pmd write protected page with a zero pte write + * protected page. + * + * See Documentation/vm/mmu_notifier.txt + */ + pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2085,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { + /* + * FIXME: Do we want to invalidate secondary mmu by calling + * mmu_notifier_invalidate_range() see comments below inside + * __split_huge_pmd() ? + * + * We are going from a zero huge page write protected to zero + * small page also write protected so it does not seems useful + * to invalidate secondary mmu at this time. + */ return __split_huge_zero_page_pmd(vma, haddr, pmd); } @@ -2220,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback. + * They are 3 cases to consider inside __split_huge_pmd_locked(): + * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious + * 2) __split_huge_zero_page_pmd() read only zero page and any write + * fault will trigger a flush_notify before pointing to a new page + * (it is fine if the secondary mmu keeps pointing to the old zero + * page in the meantime) + * 3) Split a huge pmd into pte pointing to the same page. No need + * to invalidate secondary tlb entry they are all still valid. + * any further changes to individual pte will notify. So no need + * to call mmu_notifier->invalidate_range() + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PMD_SIZE); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2d2ff5e8bf2b..681b300185c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { + /* + * No need to notify as we are downgrading page + * table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ huge_ptep_set_wrprotect(src, addr, src_pte); - mmu_notifier_invalidate_range(src, mmun_start, - mmun_end); } entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); @@ -4318,7 +4323,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * and that page table be reused and filled with junk. */ flush_hugetlb_tlb_range(vma, start, end); - mmu_notifier_invalidate_range(mm, start, end); + /* + * No need to call mmu_notifier_invalidate_range() we are downgrading + * page table protection not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6f319fb81718..405bba487df5 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size) } void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) + slab_flags_t *flags) { int redzone_adjust; int orig_size = *size; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 43cb3043311b..ea4ff259b671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) _pmd = pmdp_collapse_flush(vma, addr, pmd); spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); + mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 800d64b854ea..cec594032515 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -1,126 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/gfp.h> -#include <linux/mm_types.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include "slab.h" -#include <linux/kmemcheck.h> - -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ - struct page *shadow; - int pages; - int i; - - pages = 1 << order; - - /* - * With kmemcheck enabled, we need to allocate a memory area for the - * shadow bits as well. - */ - shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); - if (!shadow) { - if (printk_ratelimit()) - pr_err("kmemcheck: failed to allocate shadow bitmap\n"); - return; - } - - for(i = 0; i < pages; ++i) - page[i].shadow = page_address(&shadow[i]); - - /* - * Mark it as non-present for the MMU so that our accesses to - * this memory will trigger a page fault and let us analyze - * the memory accesses. - */ - kmemcheck_hide_pages(page, pages); -} - -void kmemcheck_free_shadow(struct page *page, int order) -{ - struct page *shadow; - int pages; - int i; - - if (!kmemcheck_page_is_tracked(page)) - return; - - pages = 1 << order; - - kmemcheck_show_pages(page, pages); - - shadow = virt_to_page(page[0].shadow); - - for(i = 0; i < pages; ++i) - page[i].shadow = NULL; - - __free_pages(shadow, order); -} - -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ - if (unlikely(!object)) /* Skip object if allocation failed */ - return; - - /* - * Has already been memset(), which initializes the shadow for us - * as well. - */ - if (gfpflags & __GFP_ZERO) - return; - - /* No need to initialize the shadow of a non-tracked slab. */ - if (s->flags & SLAB_NOTRACK) - return; - - if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { - /* - * Allow notracked objects to be allocated from - * tracked caches. Note however that these objects - * will still get page faults on access, they just - * won't ever be flagged as uninitialized. If page - * faults are not acceptable, the slab cache itself - * should be marked NOTRACK. - */ - kmemcheck_mark_initialized(object, size); - } else if (!s->ctor) { - /* - * New objects should be marked uninitialized before - * they're returned to the called. - */ - kmemcheck_mark_uninitialized(object, size); - } -} - -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) -{ - /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - kmemcheck_mark_freed(object, size); -} - -void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, - gfp_t gfpflags) -{ - int pages; - - if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) - return; - - pages = 1 << order; - - /* - * NOTE: We choose to track GFP_ZERO pages too; in fact, they - * can become uninitialized by copying uninitialized memory - * into them. - */ - - /* XXX: Can use zone->node for node? */ - kmemcheck_alloc_shadow(page, order, gfpflags, -1); - - if (gfpflags & __GFP_ZERO) - kmemcheck_mark_initialized_pages(page, pages); - else - kmemcheck_mark_uninitialized_pages(page, pages); -} diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 7780cd83a495..e4738d5e9b8c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -110,7 +110,6 @@ #include <linux/atomic.h> #include <linux/kasan.h> -#include <linux/kmemcheck.h> #include <linux/kmemleak.h> #include <linux/memory_hotplug.h> @@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; - if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) - return false; - kasan_disable_current(); object->checksum = crc32(0, (void *)object->pointer, object->size); kasan_enable_current(); @@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end, if (scan_should_stop()) break; - /* don't scan uninitialized memory */ - if (!kmemcheck_is_obj_initialized((unsigned long)ptr, - BYTES_PER_POINTER)) - continue; - kasan_disable_current(); pointer = *ptr; kasan_enable_current(); @@ -2104,7 +2095,7 @@ static int __init kmemleak_late_init(void) return -ENOMEM; } - dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, + dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops); if (!dentry) pr_warn("Failed to create the debugfs kmemleak file\n"); @@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. + * + * No need to notify as we are downgrading page table to read + * only not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt */ - entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); + entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page @@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); + /* + * No need to notify as we are replacing a read only page with another + * read only page with the same content. + * + * See Documentation/vm/mmu_notifier.txt + */ + ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); page_remove_rmap(page, false); diff --git a/mm/list_lru.c b/mm/list_lru.c index f141f0c80ff3..fd41e969ede5 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -221,6 +221,7 @@ restart: switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); + /* fall through */ case LRU_REMOVED: isolated++; nlru->nr_items--; diff --git a/mm/memblock.c b/mm/memblock.c index 91205780e6b1..46aacdfa4f4d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -533,7 +533,7 @@ repeat: base = obase; nr_new = 0; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, if (memblock_double_array(type, base, size) < 0) return -ENOMEM; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -1327,7 +1327,6 @@ again: return NULL; done: ptr = phys_to_virt(alloc); - memset(ptr, 0, size); /* * The min_count is set to 0 so that bootmem allocated blocks @@ -1341,6 +1340,45 @@ done: } /** + * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memory and without panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory, does not panic if request + * cannot be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); +#ifdef CONFIG_DEBUG_VM + if (ptr && size > 0) + memset(ptr, 0xff, size); +#endif + return ptr; +} + +/** * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size @@ -1351,8 +1389,8 @@ done: * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides - * additional debug information (including caller info), if enabled. + * Public function, provides additional debug information (including caller + * info), if enabled. This function zeroes the allocated memory. * * RETURNS: * Virtual address of allocated memory block on success, NULL on failure. @@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { + void *ptr; + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, (u64)max_addr, (void *)_RET_IP_); - return memblock_virt_alloc_internal(size, align, min_addr, - max_addr, nid); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + memset(ptr, 0, size); + return ptr; } /** @@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_virt_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * @@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid( (u64)max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); - if (ptr) + if (ptr) { + memset(ptr, 0, size); return ptr; + } panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, @@ -1715,7 +1761,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { char nid_buf[32] = ""; base = rgn->base; @@ -1739,7 +1785,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) unsigned long size = 0; int idx; - for_each_memblock_type((&memblock.reserved), rgn) { + for_each_memblock_type(idx, (&memblock.reserved), rgn) { phys_addr_t start, end; if (rgn->base + rgn->size < start_addr) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 661f046ad318..50e6906314f8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4049,7 +4049,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#ifdef CONFIG_SLABINFO +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) { .name = "kmem.slabinfo", .seq_start = memcg_slab_start, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 88366626c0b7..4acdf393a801 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1587,7 +1587,7 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", + pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n", pfn, ret, page->flags, &page->flags); if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); diff --git a/mm/memory.c b/mm/memory.c index cae514e7dcfc..85e7a87da79f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -665,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } @@ -2554,7 +2555,11 @@ static int wp_page_copy(struct vm_fault *vmf) put_page(new_page); pte_unmap_unlock(vmf->pte, vmf->ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, @@ -2842,7 +2847,7 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache; + struct page *page = NULL, *swapcache = NULL; struct mem_cgroup *memcg; struct vma_swap_readahead swap_ra; swp_entry_t entry; @@ -2881,17 +2886,36 @@ int do_swap_page(struct vm_fault *vmf) } goto out; } + + delayacct_set_flag(DELAYACCT_PF_SWAPIN); if (!page) page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, vmf->address); if (!page) { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); + struct swap_info_struct *si = swp_swap_info(entry); + + if (si->flags & SWP_SYNCHRONOUS_IO && + __swap_count(si, entry) == 1) { + /* skip swapcache */ + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + set_page_private(page, entry.val); + lru_cache_add_anon(page); + swap_readpage(page, true); + } + } else { + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); + swapcache = page; + } + if (!page) { /* * Back out if somebody else faulted in this pte @@ -2920,7 +2944,6 @@ int do_swap_page(struct vm_fault *vmf) goto out_release; } - swapcache = page; locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -2935,7 +2958,8 @@ int do_swap_page(struct vm_fault *vmf) * test below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not changed. */ - if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + if (unlikely((!PageSwapCache(page) || + page_private(page) != entry.val)) && swapcache) goto out_page; page = ksm_might_need_to_copy(page, vma, vmf->address); @@ -2988,14 +3012,16 @@ int do_swap_page(struct vm_fault *vmf) pte = pte_mksoft_dirty(pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); vmf->orig_pte = pte; - if (page == swapcache) { - do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); - activate_page(page); - } else { /* ksm created a completely new copy */ + + /* ksm created a completely new copy */ + if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); + } else { + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); + mem_cgroup_commit_charge(page, memcg, true, false); + activate_page(page); } swap_free(entry); @@ -3003,7 +3029,7 @@ int do_swap_page(struct vm_fault *vmf) (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -3036,7 +3062,7 @@ out_page: unlock_page(page); out_release: put_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); } @@ -3212,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) goto map_pte; } - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; @@ -3271,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) * We are going to consume the prealloc table, * count that as nr_ptes. */ - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } @@ -4124,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_5LEVEL_HACK - if (p4d_present(*p4d)) /* Another has populated it */ - pud_free(mm, new); - else + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); -#else - if (pgd_present(*p4d)) /* Another has populated it */ + } else /* Another has populated it */ pud_free(mm, new); - else +#else + if (!pgd_present(*p4d)) { + mm_inc_nr_puds(mm); pgd_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); #endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; @@ -4457,17 +4485,15 @@ void print_vma_addr(char *prefix, unsigned long ip) struct vm_area_struct *vma; /* - * Do not print if we are in atomic - * contexts (in exception stacks, etc.): + * we might be running from an atomic context so we cannot sleep */ - if (preempt_count()) + if (!down_read_trylock(&mm->mmap_sem)) return; - down_read(&mm->mmap_sem); vma = find_vma(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; - char *buf = (char *)__get_free_page(GFP_KERNEL); + char *buf = (char *)__get_free_page(GFP_NOWAIT); if (buf) { char *p; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d4b5f29906b9..c52aa05b106c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -265,7 +265,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, /* * Make all the pages reserved so that nobody will stumble over half * initialized state. - * FIXME: We also have to associate it with a node because pfn_to_node + * FIXME: We also have to associate it with a node because page_to_nid * relies on having page with the proper node. */ for (i = 0; i < PAGES_PER_SECTION; i++) { @@ -1590,11 +1590,11 @@ static void node_states_clear_node(int node, struct memory_notify *arg) } static int __ref __offline_pages(unsigned long start_pfn, - unsigned long end_pfn, unsigned long timeout) + unsigned long end_pfn) { - unsigned long pfn, nr_pages, expire; + unsigned long pfn, nr_pages; long offlined_pages; - int ret, drain, retry_max, node; + int ret, node; unsigned long flags; unsigned long valid_start, valid_end; struct zone *zone; @@ -1630,44 +1630,22 @@ static int __ref __offline_pages(unsigned long start_pfn, goto failed_removal; pfn = start_pfn; - expire = jiffies + timeout; - drain = 0; - retry_max = 5; repeat: /* start memory hot removal */ - ret = -EAGAIN; - if (time_after(jiffies, expire)) - goto failed_removal; ret = -EINTR; if (signal_pending(current)) goto failed_removal; - ret = 0; - if (drain) { - lru_add_drain_all_cpuslocked(); - cond_resched(); - drain_all_pages(zone); - } + + cond_resched(); + lru_add_drain_all_cpuslocked(); + drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); if (pfn) { /* We have movable pages */ ret = do_migrate_range(pfn, end_pfn); - if (!ret) { - drain = 1; - goto repeat; - } else { - if (ret < 0) - if (--retry_max == 0) - goto failed_removal; - yield(); - drain = 1; - goto repeat; - } + goto repeat; } - /* drain all zone's lru pagevec, this is asynchronous... */ - lru_add_drain_all_cpuslocked(); - yield(); - /* drain pcp pages, this is synchronous. */ - drain_all_pages(zone); + /* * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. @@ -1677,10 +1655,8 @@ repeat: goto failed_removal; /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); - if (offlined_pages < 0) { - ret = -EBUSY; - goto failed_removal; - } + if (offlined_pages < 0) + goto repeat; pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ @@ -1728,7 +1704,7 @@ failed_removal: /* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { - return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); + return __offline_pages(start_pfn, start_pfn + nr_pages); } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a2af6d58a68f..4ce44d3ff03d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -85,6 +85,7 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> +#include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -1365,7 +1366,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { - const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; @@ -1401,15 +1401,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, err = -EINVAL; /* - * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative - * capabilities, superuser privileges or the same - * userid as the target process. + * Check if this process has the right to modify the specified process. + * Use the regular "ptrace_may_access()" checks. */ - tcred = __task_cred(task); - if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && - !capable(CAP_SYS_NICE)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; @@ -1920,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, struct page *page; page = __alloc_pages(gfp, order, nid); + /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return page; if (page && page_to_nid(page) == nid) { preempt_disable(); __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); diff --git a/mm/mempool.c b/mm/mempool.c index c4a23cdae3f0..7d8c5a0010a2 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -189,7 +189,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); if (!pool) return NULL; - pool->elements = kmalloc_node(min_nr * sizeof(void *), + pool->elements = kmalloc_array_node(min_nr, sizeof(void *), gfp_mask, node_id); if (!pool->elements) { kfree(pool); diff --git a/mm/migrate.c b/mm/migrate.c index 1236449b4777..4d0be47a322a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. + */ if (notified) - mmu_notifier_invalidate_range_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(mm, mmu_start, + migrate->end); } /* diff --git a/mm/mlock.c b/mm/mlock.c index 46af369c13e5..30472d438794 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct pagevec pvec_putback; int pgrescued = 0; - pagevec_init(&pvec_putback, 0); + pagevec_init(&pvec_putback); /* Phase 1: page isolation */ spin_lock_irq(zone_lru_lock(zone)); @@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, struct pagevec pvec; struct zone *zone; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -670,8 +670,6 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla if (!can_do_mlock()) return -EPERM; - lru_add_drain_all(); /* flush pagevec */ - len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; @@ -798,9 +796,6 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!can_do_mlock()) return -EPERM; - if (flags & MCL_CURRENT) - lru_add_drain_all(); /* flush pagevec */ - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 314285284e6e..96edb33fd09a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, + unsigned long end, + bool only_end) { struct mmu_notifier *mn; int id; @@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, * subsystem registers either invalidate_range_start()/end() or * invalidate_range(), so this will be no additional overhead * (besides the pointer check). + * + * We skip call to invalidate_range() if we know it is safe ie + * call site use mmu_notifier_invalidate_range_only_end() which + * is safe to do when we know that a call to invalidate_range() + * already happen under page table lock. */ - if (mn->ops->invalidate_range) + if (!only_end && mn->ops->invalidate_range) mn->ops->invalidate_range(mn, mm, start, end); if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dee0f75c3013..c86fbd1b590e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include <asm/tlb.h> #include "internal.h" +#include "slab.h" #define CREATE_TRACE_POINTS #include <trace/events/oom.h> @@ -161,6 +162,25 @@ static bool oom_unkillable_task(struct task_struct *p, return false; } +/* + * Print out unreclaimble slabs info when unreclaimable slabs amount is greater + * than all user memory (LRU pages) + */ +static bool is_dump_unreclaim_slabs(void) +{ + unsigned long nr_lru; + + nr_lru = global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + + global_node_page_state(NR_ISOLATED_ANON) + + global_node_page_state(NR_ISOLATED_FILE) + + global_node_page_state(NR_UNEVICTABLE); + + return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru); +} + /** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate @@ -201,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* @@ -369,15 +389,15 @@ static void select_bad_process(struct oom_control *oc) * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, - * swapents, oom_score_adj value, and name. + * State information includes task's pid, uid, tgid, vm size, rss, + * pgtables_bytes, swapents, oom_score_adj value, and name. */ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -393,11 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - atomic_long_read(&task->mm->nr_ptes), - mm_nr_pmds(task->mm), + mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -407,23 +426,22 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", - current->comm, oc->gfp_mask, &oc->gfp_mask); - if (oc->nodemask) - pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); - else - pr_cont("(null)"); - pr_cont(", order=%d, oom_score_adj=%hd\n", - oc->order, current->signal->oom_score_adj); + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, + nodemask_pr_args(oc->nodemask), oc->order, + current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); cpuset_print_current_mems_allowed(); dump_stack(); - if (oc->memcg) + if (is_memcg_oom(oc)) mem_cgroup_print_oom_info(oc->memcg, p); - else + else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); + if (is_dump_unreclaim_slabs()) + dump_unreclaimable_slab(); + } if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } @@ -618,9 +636,6 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - if (!oom_reaper_th) - return; - /* tsk is already queued? */ if (tsk == oom_reaper_list || tsk->oom_reaper_list) return; @@ -638,11 +653,6 @@ static void wake_oom_reaper(struct task_struct *tsk) static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); - if (IS_ERR(oom_reaper_th)) { - pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", - PTR_ERR(oom_reaper_th)); - oom_reaper_th = NULL; - } return 0; } subsys_initcall(oom_init) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c518c845f202..8a1551154285 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -433,8 +433,11 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; - if (bg_thresh >= thresh) + if (unlikely(bg_thresh >= thresh)) { + pr_warn("vm direct limit must be set greater than background limit.\n"); bg_thresh = thresh / 2; + } + tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; @@ -625,9 +628,9 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ -static void writeout_period(unsigned long t) +static void writeout_period(struct timer_list *t) { - struct wb_domain *dom = (void *)t; + struct wb_domain *dom = from_timer(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; @@ -650,8 +653,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - setup_deferrable_timer(&dom->period_timer, writeout_period, - (unsigned long)dom); + timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; @@ -1543,7 +1545,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * actually dirty; with m+n sitting in the percpu * deltas. */ - if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { + if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { @@ -1559,8 +1561,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping, - struct bdi_writeback *wb, +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; @@ -1802,7 +1803,7 @@ pause: * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ - if (sdtc->wb_dirty <= wb_stat_error(wb)) + if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) @@ -1910,7 +1911,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(mapping, wb, current->nr_dirtied); + balance_dirty_pages(wb, current->nr_dirtied); wb_put(wb); } @@ -2167,7 +2168,7 @@ int write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -2194,30 +2195,14 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } - done_index = page->index; lock_page(page); @@ -2623,7 +2608,7 @@ EXPORT_SYMBOL(set_page_dirty_lock); * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ -void cancel_dirty_page(struct page *page) +void __cancel_dirty_page(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2644,7 +2629,7 @@ void cancel_dirty_page(struct page *page) ClearPageDirty(page); } } -EXPORT_SYMBOL(cancel_dirty_page); +EXPORT_SYMBOL(__cancel_dirty_page); /* * Clear a page's dirty flag, while caring for dirty memory accounting. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77e4d3c5c57b..55ded92f9809 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -24,7 +24,6 @@ #include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/kasan.h> #include <linux/module.h> #include <linux/suspend.h> @@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); + #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. @@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. + */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); /* * Check tail pages before head page information is cleared to @@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone, static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone) } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __init deferred_free_range(struct page *page, - unsigned long pfn, int nr_pages) +static void __init deferred_free_range(unsigned long pfn, + unsigned long nr_pages) { - int i; + struct page *page; + unsigned long i; - if (!page) + if (!nr_pages) return; + page = pfn_to_page(pfn); + /* Free a large naturally-aligned chunk if possible */ if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { @@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void) complete(&pgdat_init_all_done_comp); } +/* + * Helper for deferred_init_range, free the given range, reset the counters, and + * return number of pages freed. + */ +static inline unsigned long __init __def_free(unsigned long *nr_free, + unsigned long *free_base_pfn, + struct page **page) +{ + unsigned long nr = *nr_free; + + deferred_free_range(*free_base_pfn, nr); + *free_base_pfn = 0; + *nr_free = 0; + *page = NULL; + + return nr; +} + +static unsigned long __init deferred_init_range(int nid, int zid, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long free_base_pfn = 0; + unsigned long nr_pages = 0; + unsigned long nr_free = 0; + struct page *page = NULL; + unsigned long pfn; + + /* + * First we check if pfn is valid on architectures where it is possible + * to have holes within pageblock_nr_pages. On systems where it is not + * possible, this function is optimized out. + * + * Then, we check if a current large page is valid by only checking the + * validity of the head pfn. + * + * meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not + * belong to this memory node. + * + * Finally, we minimize pfn page lookups and scheduler checks by + * performing it only once every pageblock_nr_pages. + * + * We do it in two loops: first we initialize struct page, than free to + * buddy allocator, becuse while we are freeing pages we can access + * pages that are ahead (computing buddy page in __free_one_page()). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + if ((pfn & nr_pgmask) || pfn_valid(pfn)) { + if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + if (page && (pfn & nr_pgmask)) + page++; + else + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zid, nid); + cond_resched(); + } + } + } + + page = NULL; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (page && (pfn & nr_pgmask)) { + page++; + nr_free++; + } else { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + page = pfn_to_page(pfn); + free_base_pfn = pfn; + nr_free = 1; + cond_resched(); + } + } + /* Free the last block of pages to allocator */ + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; int nid = pgdat->node_id; - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long walk_start, walk_end; - int i, zid; + unsigned long spfn, epfn; + phys_addr_t spa, epa; + int zid; struct zone *zone; unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + u64 i; if (first_init_pfn == ULONG_MAX) { pgdat_init_report_one_done(); @@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } + first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); - for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { - unsigned long pfn, end_pfn; - struct page *page = NULL; - struct page *free_base_page = NULL; - unsigned long free_base_pfn = 0; - int nr_to_free = 0; - - end_pfn = min(walk_end, zone_end_pfn(zone)); - pfn = first_init_pfn; - if (pfn < walk_start) - pfn = walk_start; - if (pfn < zone->zone_start_pfn) - pfn = zone->zone_start_pfn; - - for (; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - goto free_range; - - /* - * Ensure pfn_valid is checked every - * pageblock_nr_pages for memory holes - */ - if ((pfn & (pageblock_nr_pages - 1)) == 0) { - if (!pfn_valid(pfn)) { - page = NULL; - goto free_range; - } - } - - if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - page = NULL; - goto free_range; - } - - /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { - page++; - } else { - nr_pages += nr_to_free; - deferred_free_range(free_base_page, - free_base_pfn, nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - - page = pfn_to_page(pfn); - cond_resched(); - } - - if (page->flags) { - VM_BUG_ON(page_zone(page) != zone); - goto free_range; - } - - __init_single_page(page, pfn, zid, nid); - if (!free_base_page) { - free_base_page = page; - free_base_pfn = pfn; - nr_to_free = 0; - } - nr_to_free++; - - /* Where possible, batch up pages for a single free */ - continue; -free_range: - /* Free the current block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, - nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - } - /* Free the last block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, nr_to_free); - - first_init_pfn = max(end_pfn, first_init_pfn); + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + nr_pages += deferred_init_range(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ @@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static inline +static __always_inline struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { @@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { }; #ifdef CONFIG_CMA -static struct page *__rmqueue_cma_fallback(struct zone *zone, +static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, unsigned int order) { return __rmqueue_smallest(zone, order, MIGRATE_CMA); @@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * deviation from the rest of this file, to make the for loop * condition simpler. */ -static inline bool +static __always_inline bool __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area *area; @@ -2289,8 +2321,8 @@ do_steal: * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) +static __always_inline struct page * +__rmqueue(struct zone *zone, unsigned int order, int migratetype) { struct page *page; @@ -2315,7 +2347,7 @@ retry: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, bool cold) + int migratetype) { int i, alloced = 0; @@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, continue; /* - * Split buddy pages returned by expand() are received here - * in physical page order. The page is added to the callers and - * list and the list head then moves forward. From the callers - * perspective, the linked list is ordered by page number in - * some conditions. This is useful for IO devices that can - * merge IO requests if the physical pages are ordered - * properly. + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of + * caller's list. From the callers perspective, the linked list + * is ordered by page number under some conditions. This is + * useful for IO devices that can forward direction from the + * head, thus also in the physical page order. This is useful + * for IO devices that can merge IO requests if the physical + * pages are ordered properly. */ - if (likely(!cold)) - list_add(&page->lru, list); - else - list_add_tail(&page->lru, list); - list = &page->lru; + list_add_tail(&page->lru, list); alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -2590,24 +2619,25 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -/* - * Free a 0-order page - * cold == true ? free a cold page : free a hot page - */ -void free_hot_cold_page(struct page *page, bool cold) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; - unsigned long flags; - unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pcp_prepare(page)) - return; + return false; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); + return true; +} + +static void free_unref_page_commit(struct page *page, unsigned long pfn) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + int migratetype; + + migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); /* @@ -2620,38 +2650,62 @@ void free_hot_cold_page(struct page *page, bool cold) if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(zone, page, pfn, 0, migratetype); - goto out; + return; } migratetype = MIGRATE_MOVABLE; } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (!cold) - list_add(&page->lru, &pcp->lists[migratetype]); - else - list_add_tail(&page->lru, &pcp->lists[migratetype]); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); pcp->count -= batch; } +} -out: +/* + * Free a 0-order page + */ +void free_unref_page(struct page *page) +{ + unsigned long flags; + unsigned long pfn = page_to_pfn(page); + + if (!free_unref_page_prepare(page, pfn)) + return; + + local_irq_save(flags); + free_unref_page_commit(page, pfn); local_irq_restore(flags); } /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, bool cold) +void free_unref_page_list(struct list_head *list) { struct page *page, *next; + unsigned long flags, pfn; + + /* Prepare pages for freeing */ + list_for_each_entry_safe(page, next, list, lru) { + pfn = page_to_pfn(page); + if (!free_unref_page_prepare(page, pfn)) + list_del(&page->lru); + set_page_private(page, pfn); + } + local_irq_save(flags); list_for_each_entry_safe(page, next, list, lru) { - trace_mm_page_free_batched(page, cold); - free_hot_cold_page(page, cold); + unsigned long pfn = page_private(page); + + set_page_private(page, 0); + trace_mm_page_free_batched(page); + free_unref_page_commit(page, pfn); } + local_irq_restore(flags); } /* @@ -2669,15 +2723,6 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); -#ifdef CONFIG_KMEMCHECK - /* - * Split shadow pages too, because free(page[0]) would - * otherwise free the whole shadow. - */ - if (kmemcheck_page_is_tracked(page)) - split_page(virt_to_page(page[0].shadow), order); -#endif - for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); @@ -2743,6 +2788,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; + /* skip numa counters update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return; + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; @@ -2758,7 +2807,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, - bool cold, struct per_cpu_pages *pcp, + struct per_cpu_pages *pcp, struct list_head *list) { struct page *page; @@ -2767,16 +2816,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, - migratetype, cold); + migratetype); if (unlikely(list_empty(list))) return NULL; } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; } while (check_new_pcp(page)); @@ -2791,14 +2836,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, { struct per_cpu_pages *pcp; struct list_head *list; - bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; unsigned long flags; local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); @@ -3006,9 +3050,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -3020,6 +3061,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } @@ -3235,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; - pr_warn("%s: ", current->comm); - va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_cont("%pV", &vaf); + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + current->comm, &vaf, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); va_end(args); - pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); - if (nodemask) - pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); - else - pr_cont("(null)\n"); - cpuset_print_current_mems_allowed(); dump_stack(); @@ -3868,8 +3906,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries; int no_progress_loops; - unsigned long alloc_start = jiffies; - unsigned int stall_timeout = 10 * HZ; unsigned int cpuset_mems_cookie; int reserve_flags; @@ -4001,14 +4037,6 @@ retry: if (!can_direct_reclaim) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage; @@ -4223,9 +4251,6 @@ out: page = NULL; } - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; @@ -4262,7 +4287,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -4320,7 +4345,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) unsigned int order = compound_order(page); if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -6126,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } +#ifdef CONFIG_FLAT_NODE_MEM_MAP static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; @@ -6135,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) if (!pgdat->node_spanned_pages) return; -#ifdef CONFIG_FLAT_NODE_MEM_MAP start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* ia64 gets its own node_mem_map, before this, without bootmem */ @@ -6157,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pgdat->node_id); pgdat->node_mem_map = map + offset; } + pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", + __func__, pgdat->node_id, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); #ifndef CONFIG_NEED_MULTIPLE_NODES /* * With no DISCONTIG, the global mem_map is just set as node 0's @@ -6169,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif -#endif /* CONFIG_FLAT_NODE_MEM_MAP */ } +#else +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) @@ -6197,16 +6227,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", - nid, (unsigned long)pgdat, - (unsigned long)pgdat->node_mem_map); -#endif reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } +#ifdef CONFIG_HAVE_MEMBLOCK +/* + * Only struct pages that are backed by physical memory are zeroed and + * initialized by going through __init_single_page(). But, there are some + * struct pages which are reserved in memblock allocator and their fields + * may be accessed (for example page_to_pfn() on some configuration accesses + * flags). We must explicitly zero those struct pages. + */ +void __paginginit zero_resv_unavail(void) +{ + phys_addr_t start, end; + unsigned long pfn; + u64 i, pgcnt; + + /* + * Loop through ranges that are reserved, but do not have reported + * physical memory backing. + */ + pgcnt = 0; + for_each_resv_unavail_range(i, &start, &end) { + for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + } + + /* + * Struct pages that do not have backing memory. This could be because + * firmware is using some of this memory, or for some other reasons. + * Once memblock is changed so such behaviour is not allowed: i.e. + * list of "reserved" memory must be a subset of list of "memory", then + * this code can be removed. + */ + if (pgcnt) + pr_info("Reserved but unavailable: %lld pages", pgcnt); +} +#endif /* CONFIG_HAVE_MEMBLOCK */ + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 @@ -6630,6 +6693,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + zero_resv_unavail(); } static int __init cmdline_parse_core(char *p, unsigned long *core) @@ -6793,6 +6857,7 @@ void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + zero_resv_unavail(); } static int page_alloc_cpu_dead(unsigned int cpu) @@ -7305,18 +7370,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); - /* - * memblock allocator returns zeroed memory already, so HASH_ZERO is - * currently not used when HASH_EARLY is specified. - */ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; - if (flags & HASH_EARLY) - table = memblock_virt_alloc_nopanic(size, 0); - else if (hashdist) + if (flags & HASH_EARLY) { + if (flags & HASH_ZERO) + table = memblock_virt_alloc_nopanic(size, 0); + else + table = memblock_virt_alloc_raw(size, 0); + } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); - else { + } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which @@ -7353,10 +7417,10 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + int migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; - int mt; /* * For avoiding noise data, lru_add_drain_all() should be called @@ -7364,8 +7428,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, */ if (zone_idx(zone) == ZONE_MOVABLE) return false; - mt = get_pageblock_migratetype(page); - if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) + + /* + * CMA allocations (alloc_contig_range) really need to mark isolate + * CMA pageblocks even when they are not movable in fact so consider + * them movable here. + */ + if (is_migrate_cma(migratetype) && + is_migrate_cma(get_pageblock_migratetype(page))) return false; pfn = page_to_pfn(page); @@ -7377,6 +7447,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, page = pfn_to_page(check); + if (PageReserved(page)) + return true; + /* * Hugepages are not in LRU lists, but they're movable. * We need not scan over tail pages bacause we don't @@ -7450,7 +7523,7 @@ bool is_pageblock_removable_nolock(struct page *page) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, true); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); } #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) diff --git a/mm/page_ext.c b/mm/page_ext.c index 4f0367d472c4..2c16216c29b6 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (unlikely(!base)) return NULL; -#endif index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); @@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (!section->page_ext) return NULL; -#endif return get_entry(section->page_ext, pfn); } diff --git a/mm/page_io.c b/mm/page_io.c index cd52b9cc169b..e93f1a4cacd7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -347,7 +347,7 @@ out: return ret; } -int swap_readpage(struct page *page, bool do_poll) +int swap_readpage(struct page *page, bool synchronous) { struct bio *bio; int ret = 0; @@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll) blk_qc_t qc; struct gendisk *disk; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool do_poll) count_vm_event(PSWPIN); bio_get(bio); qc = submit_bio(bio); - while (do_poll) { + while (synchronous) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio->bi_private)) break; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 44f213935bf6..165ed8117bd1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,7 +15,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/page_isolation.h> -static int set_migratetype_isolate(struct page *page, +static int set_migratetype_isolate(struct page *page, int migratetype, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page, * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, + if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, skip_hwpoisoned_pages)) ret = 0; @@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page, out: if (!ret) { unsigned long nr_pages; - int migratetype = get_pageblock_migratetype(page); + int mt = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, NULL); - __mod_zone_freepage_state(zone, -nr_pages, migratetype); + __mod_zone_freepage_state(zone, -nr_pages, mt); } spin_unlock_irqrestore(&zone->lock, flags); @@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (page && - set_migratetype_isolate(page, skip_hwpoisoned_pages)) { + set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { undo_pfn = pfn; goto undo; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f44b95b9d1e..8592543a0f15 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -20,9 +20,9 @@ #define PAGE_OWNER_STACK_DEPTH (16) struct page_owner { - unsigned int order; + unsigned short order; + short last_migrate_reason; gfp_t gfp_mask; - int last_migrate_reason; depot_stack_handle_t handle; }; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 15dab691ea70..9158e5a81391 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; diff --git a/mm/rmap.c b/mm/rmap.c index b874c4761e84..47db27f8049e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -899,7 +899,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { - unsigned long cstart, cend; + unsigned long cstart; int ret = 0; cstart = address = pvmw.address; @@ -915,7 +915,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); - cend = cstart + PAGE_SIZE; ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE @@ -931,7 +930,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); cstart &= PMD_MASK; - cend = cstart + PMD_SIZE; ret = 1; #else /* unexpected pmd-mapped page? */ @@ -939,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, #endif } - if (ret) { - mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ + if (ret) (*cleaned)++; - } } mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); @@ -1318,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound) * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, + * before us: so leave the reset to free_unref_page, * and remember that it's only reliable while mapped. * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. @@ -1426,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ goto discard; } @@ -1483,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * will take care of the rest. */ dec_mm_counter(mm, mm_counter(page)); + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; @@ -1498,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; @@ -1509,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, WARN_ON_ONCE(1); ret = false; /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); page_vma_mapped_walk_done(&pvmw); break; } @@ -1516,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!PageSwapBacked(page)) { if (!PageDirty(page)) { + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, + address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); goto discard; } @@ -1549,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); - } else + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + } else { + /* + * We should not need to notify here as we reach this + * case only from freeze_page() itself only call from + * split_huge_page_to_list() so everything below must + * be true: + * - page is not anonymous + * - page is locked + * + * So as it is a locked file back page thus it can not + * be remove from the page cache and replace by a new + * page before mmu_notifier_invalidate_range_end so no + * concurrent thread might update its page table to + * point at new page while a device still is using this + * page. + * + * See Documentation/vm/mmu_notifier.txt + */ dec_mm_counter(mm, mm_counter_file(page)); + } discard: + /* + * No need to call mmu_notifier_invalidate_range() it has be + * done above for all cases requiring it to happen under page + * table lock before mmu_notifier_invalidate_range_end() + * + * See Documentation/vm/mmu_notifier.txt + */ page_remove_rmap(subpage, PageHuge(page)); put_page(page); - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); diff --git a/mm/shmem.c b/mm/shmem.c index 07a1d22807be..ab22eaa2412e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -338,7 +338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, if (item != expected) return -ENOENT; __radix_tree_replace(&mapping->page_tree, node, pslot, - replacement, NULL, NULL); + replacement, NULL); return 0; } @@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping) pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ @@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end) { pvec.nr = find_get_entries(mapping, index, @@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, bool done = false; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); pvec.nr = 1; /* start small: we may be there already */ while (!done) { pvec.nr = find_get_entries(mapping, index, @@ -3862,12 +3862,11 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static int shmem_init_inodecache(void) +static void shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); - return 0; } static void shmem_destroy_inodecache(void) @@ -3991,9 +3990,7 @@ int __init shmem_init(void) if (shmem_inode_cachep) return 0; - error = shmem_init_inodecache(); - if (error) - goto out3; + shmem_init_inodecache(); error = register_filesystem(&shmem_fs_type); if (error) { @@ -4020,7 +4017,6 @@ out1: unregister_filesystem(&shmem_fs_type); out2: shmem_destroy_inodecache(); -out3: shm_mnt = ERR_PTR(error); return error; } @@ -4102,6 +4098,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) return true; + /* fall through */ case SHMEM_HUGE_ADVISE: /* TODO: implement fadvise() hints */ return (vma->vm_flags & VM_HUGEPAGE); diff --git a/mm/slab.c b/mm/slab.c index b7095884fd93..183e996dde5f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -114,7 +114,6 @@ #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> -#include <linux/kmemcheck.h> #include <linux/memory.h> #include <linux/prefetch.h> #include <linux/sched/task_stack.h> @@ -252,8 +251,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) -#define CFLGS_OFF_SLAB (0x80000000UL) +#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) +#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -441,7 +440,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) * Calculate the number of objects and left-over bytes for a given buffer size. */ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, - unsigned long flags, size_t *left_over) + slab_flags_t flags, size_t *left_over) { unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; @@ -1410,10 +1409,8 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nr_pages; flags |= cachep->allocflags; - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - flags |= __GFP_RECLAIMABLE; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); + page = __alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) { slab_out_of_memory(cachep, flags, nodeid); return NULL; @@ -1435,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); - if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { - kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); - - if (cachep->ctor) - kmemcheck_mark_uninitialized_pages(page, nr_pages); - else - kmemcheck_mark_unallocated_pages(page, nr_pages); - } - return page; } @@ -1455,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) int order = cachep->gfporder; unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, order); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else @@ -1761,7 +1747,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left_over = 0; int gfporder; @@ -1888,8 +1874,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -1897,7 +1883,7 @@ unsigned long kmem_cache_flags(unsigned long object_size, struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *cachep; @@ -1915,7 +1901,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, } static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1938,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, } static bool set_off_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1972,7 +1958,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep, } static bool set_on_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -2008,8 +1994,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -int -__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) { size_t ralign = BYTES_PER_WORD; gfp_t gfp; @@ -2144,6 +2129,8 @@ done: cachep->allocflags = __GFP_COMP; if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; + if (flags & SLAB_RECLAIM_ACCOUNT) + cachep->allocflags |= __GFP_RECLAIMABLE; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); @@ -3516,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - kmemcheck_slab_free(cachep, objp, cachep->object_size); - /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which @@ -4097,7 +4082,6 @@ out: schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } -#ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { unsigned long active_objs, num_objs, active_slabs; @@ -4405,7 +4389,6 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); -#endif #ifdef CONFIG_HARDENED_USERCOPY /* diff --git a/mm/slab.h b/mm/slab.h index 86d7c7d860f9..ad657ffa44e5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,7 +21,7 @@ struct kmem_cache { unsigned int object_size;/* The original size of the object */ unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ - unsigned long flags; /* Active flags on the slab */ + slab_flags_t flags; /* Active flags on the slab */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -40,7 +40,6 @@ struct kmem_cache { #include <linux/memcontrol.h> #include <linux/fault-inject.h> -#include <linux/kmemcheck.h> #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/random.h> @@ -79,13 +78,13 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size); #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); -void create_kmalloc_caches(unsigned long); +void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); @@ -93,32 +92,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); /* Functions provided by the slab allocators */ -extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, - unsigned long flags); + slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, - size_t size, unsigned long flags); + size_t size, slab_flags_t flags); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)); + slab_flags_t flags, const char *name, void (*ctor)(void *)); #ifndef CONFIG_SLOB struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)); + slab_flags_t flags, void (*ctor)(void *)); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { return NULL; } -static inline unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +static inline slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -142,10 +141,10 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_TEMPORARY | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif @@ -164,7 +163,6 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ - SLAB_NOTRACK | \ SLAB_ACCOUNT) int __kmem_cache_shutdown(struct kmem_cache *); @@ -439,7 +437,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, for (i = 0; i < size; i++) { void *object = p[i]; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); kasan_slab_alloc(s, object, flags); @@ -506,6 +503,14 @@ void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); #ifdef CONFIG_SLAB_FREELIST_RANDOM diff --git a/mm/slab_common.c b/mm/slab_common.c index 0d7fe71ff5e4..c8cb36774ba1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. @@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s) } struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)) + slab_flags_t flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. */ -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size) { /* @@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags, static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *), + slab_flags_t flags, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -431,7 +431,7 @@ out_free_cache: */ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; const char *cache_name; @@ -879,7 +879,7 @@ bool slab_is_available(void) #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { int err; @@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz } struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); @@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, unsigned long flags) +static void __init new_kmalloc_cache(int idx, slab_flags_t flags) { kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, kmalloc_info[idx].size, flags); @@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags) * may already have been created because they were needed to * enable allocations for slab creation. */ -void __init create_kmalloc_caches(unsigned long flags) +void __init create_kmalloc_caches(slab_flags_t flags) { int i; @@ -1184,8 +1184,7 @@ void cache_random_seq_destroy(struct kmem_cache *cachep) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ -#ifdef CONFIG_SLABINFO - +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) #ifdef CONFIG_SLAB #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) #else @@ -1281,7 +1280,41 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +void dump_unreclaimable_slab(void) +{ + struct kmem_cache *s, *s2; + struct slabinfo sinfo; + + /* + * Here acquiring slab_mutex is risky since we don't prefer to get + * sleep in oom path. But, without mutex hold, it may introduce a + * risk of crash. + * Use mutex_trylock to protect the list traverse, dump nothing + * without acquiring the mutex. + */ + if (!mutex_trylock(&slab_mutex)) { + pr_warn("excessive unreclaimable slab but cannot dump stats\n"); + return; + } + + pr_info("Unreclaimable slab info:\n"); + pr_info("Name Used Total\n"); + + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT)) + continue; + + get_slabinfo(s, &sinfo); + + if (sinfo.num_objs > 0) + pr_info("%-17s %10luKB %10luKB\n", cache_name(s), + (sinfo.active_objs * s->size) / 1024, + (sinfo.num_objs * s->size) / 1024); + } + mutex_unlock(&slab_mutex); +} + +#if defined(CONFIG_MEMCG) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -1355,7 +1388,7 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); -#endif /* CONFIG_SLABINFO */ +#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ static __always_inline void *__do_krealloc(const void *p, size_t new_size, gfp_t flags) diff --git a/mm/slob.c b/mm/slob.c index 10249160b693..623e8a5c46ce 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } - if (unlikely((gfp & __GFP_ZERO) && b)) + if (unlikely(gfp & __GFP_ZERO)) memset(b, 0, size); return b; } @@ -524,7 +524,7 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ diff --git a/mm/slub.c b/mm/slub.c index 1efbb8123037..cfd56e5a35fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,7 +22,6 @@ #include <linux/notifier.h> #include <linux/seq_file.h> #include <linux/kasan.h> -#include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> @@ -193,8 +192,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000UL /* Poison object */ -#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ +/* Poison object */ +#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) +/* Use cmpxchg_double */ +#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) /* * Tracking user of a slab. @@ -485,9 +486,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) -static int slub_debug = DEBUG_DEFAULT_FLAGS; +static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; #else -static int slub_debug; +static slab_flags_t slub_debug; #endif static char *slub_debug_slabs; @@ -1289,8 +1290,8 @@ out: __setup("slub_debug", setup_slub_debug); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { /* @@ -1322,8 +1323,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -1370,12 +1371,11 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) * So in order to make the debug calls that expect irqs to be * disabled we need to disable interrupts temporarily. */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) +#ifdef CONFIG_LOCKDEP { unsigned long flags; local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); debug_check_no_locks_freed(x, s->object_size); local_irq_restore(flags); } @@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, * Compiler cannot detect this function can be removed if slab_free_hook() * evaluates to nothing. Thus, catch all relevant config debug options here. */ -#if defined(CONFIG_KMEMCHECK) || \ - defined(CONFIG_LOCKDEP) || \ +#if defined(CONFIG_LOCKDEP) || \ defined(CONFIG_DEBUG_KMEMLEAK) || \ defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) @@ -1436,8 +1435,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, struct page *page; int order = oo_order(oo); - flags |= __GFP_NOTRACK; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else @@ -1596,22 +1593,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && - !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { - int pages = 1 << oo_order(oo); - - kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); - - /* - * Objects from caches that have a constructor don't get - * cleared when they're allocated, so we need to do it here. - */ - if (s->ctor) - kmemcheck_mark_uninitialized_pages(page, pages); - else - kmemcheck_mark_unallocated_pages(page, pages); - } - page->objects = oo_objects(oo); order = compound_order(page); @@ -1687,8 +1668,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) check_object(s, page, p, SLUB_RED_INACTIVE); } - kmemcheck_free_shadow(page, compound_order(page)); - mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -3477,7 +3456,7 @@ static void set_cpu_partial(struct kmem_cache *s) */ static int calculate_sizes(struct kmem_cache *s, int forced_order) { - unsigned long flags = s->flags; + slab_flags_t flags = s->flags; size_t size = s->object_size; int order; @@ -3593,7 +3572,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) +static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; @@ -3655,7 +3634,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } @@ -3792,7 +3771,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -4245,7 +4224,7 @@ void __init kmem_cache_init_late(void) struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s, *c; @@ -4275,7 +4254,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return s; } -int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { int err; @@ -5655,8 +5634,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; - if (!(s->flags & SLAB_NOTRACK)) - *p++ = 't'; if (s->flags & SLAB_ACCOUNT) *p++ = 'A'; if (p != name + 1) @@ -5704,6 +5681,10 @@ static int sysfs_slab_add(struct kmem_cache *s) return 0; } + if (!unmergeable && disable_higher_order_debug && + (slub_debug & DEBUG_METADATA_FLAGS)) + unmergeable = 1; + if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -5852,7 +5833,7 @@ __initcall(slab_sysfs_init); /* * The /proc/slabinfo ABI */ -#ifdef CONFIG_SLABINFO +#ifdef CONFIG_SLUB_DEBUG void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_slabs = 0; @@ -5884,4 +5865,4 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, { return -EIO; } -#endif /* CONFIG_SLABINFO */ +#endif /* CONFIG_SLUB_DEBUG */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 478ce6d4a2c4..17acf01791fa 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_virt_alloc_try_nid(size, align, goal, + return memblock_virt_alloc_try_nid_raw(size, align, goal, BOOTMEM_ALLOC_ACCESSIBLE, node); } @@ -53,13 +53,20 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ if (slab_is_available()) { + gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + int order = get_order(size); + static bool warned; struct page *page; - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); + page = alloc_pages_node(node, gfp_mask, order); if (page) return page_address(page); + + if (!warned) { + warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL, + "vmemmap alloc failure: order:%u", order); + warned = true; + } return NULL; } else return __earlyonly_bootmem_alloc(node, size, size, @@ -180,11 +187,22 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) return pte; } +static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) +{ + void *p = vmemmap_alloc_block(size, node); + + if (!p) + return NULL; + memset(p, 0, size); + + return p; +} + pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pmd_populate_kernel(&init_mm, pmd, p); @@ -196,7 +214,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pud_populate(&init_mm, pud, p); @@ -208,7 +226,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; p4d_populate(&init_mm, p4d, p); @@ -220,7 +238,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pgd_populate(&init_mm, pgd, p); diff --git a/mm/sparse.c b/mm/sparse.c index 60805abf98af..7a5dacaa06e3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -453,9 +453,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = memblock_virt_alloc_try_nid(size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nodeid); + map = memblock_virt_alloc_try_nid_raw(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) diff --git a/mm/swap.c b/mm/swap.c index a77d68f2c1b6..38e1b6374a97 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, false); + free_unref_page(page); } static void __put_compound_page(struct page *page) @@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, } if (pgdat) spin_unlock_irqrestore(&pgdat->lru_lock, flags); - release_pages(pvec->pages, pvec->nr, pvec->cold); + release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -740,7 +740,7 @@ void lru_add_drain_all(void) * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. */ -void release_pages(struct page **pages, int nr, bool cold) +void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); @@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold) spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); - free_hot_cold_page_list(&pages_to_free, cold); + free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); @@ -833,8 +833,11 @@ EXPORT_SYMBOL(release_pages); */ void __pagevec_release(struct pagevec *pvec) { - lru_add_drain(); - release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + if (!pvec->percpu_pvec_drained) { + lru_add_drain(); + pvec->percpu_pvec_drained = true; + } + release_pages(pvec->pages, pagevec_count(pvec)); pagevec_reinit(pvec); } EXPORT_SYMBOL(__pagevec_release); @@ -986,15 +989,25 @@ unsigned pagevec_lookup_range(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range); -unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *index, int tag, unsigned nr_pages) +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag) { - pvec->nr = find_get_pages_tag(mapping, index, tag, - nr_pages, pvec->pages); + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup_tag); +EXPORT_SYMBOL(pagevec_lookup_range_tag); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages) +{ + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ diff --git a/mm/swap_slots.c b/mm/swap_slots.c index d81cfc5a43d5..bebc19292018 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -149,6 +149,13 @@ static int alloc_swap_slot_cache(unsigned int cpu) cache->nr = 0; cache->cur = 0; cache->n_ret = 0; + /* + * We initialized alloc_lock and free_lock earlier. We use + * !cache->slots or !cache->slots_ret to know if it is safe to acquire + * the corresponding lock and use the cache. Memory barrier below + * ensures the assumption. + */ + mb(); cache->slots = slots; slots = NULL; cache->slots_ret = slots_ret; @@ -275,7 +282,7 @@ int free_swap_slot(swp_entry_t entry) struct swap_slots_cache *cache; cache = raw_cpu_ptr(&swp_slots); - if (use_swap_slot_cache && cache->slots_ret) { + if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); /* Swap slots cache may be deactivated before acquiring lock */ if (!use_swap_slot_cache || !cache->slots_ret) { @@ -326,7 +333,7 @@ swp_entry_t get_swap_page(struct page *page) */ cache = raw_cpu_ptr(&swp_slots); - if (check_cache_active()) { + if (likely(check_cache_active() && cache->slots)) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: diff --git a/mm/swap_state.c b/mm/swap_state.c index 326439428daf..39ae7cfad90f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -36,9 +36,9 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES]; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; -bool swap_vma_readahead = true; +struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +bool swap_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) lru_add_drain(); for (i = 0; i < nr; i++) free_swap_cache(pagep[i]); - release_pages(pagep, nr, false); + release_pages(pagep, nr); } /* @@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; + struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; bool do_poll = true, page_allocated; @@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, end_offset = offset | mask; if (!start_offset) /* First page is swap header. */ start_offset++; + if (end_offset >= si->max) + end_offset = si->max - 1; blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { diff --git a/mm/swapfile.c b/mm/swapfile.c index e47a21e64764..3074b02eaa09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page) return count; } +int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + pgoff_t offset = swp_offset(entry); + + return swap_count(si->swap_map[offset]); +} + static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { int count = 0; @@ -3169,6 +3176,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) p->flags |= SWP_STABLE_WRITES; + if (bdi_cap_synchronous_io(inode_to_bdi(inode))) + p->flags |= SWP_SYNCHRONOUS_IO; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; unsigned long ci, nr_cluster; @@ -3452,10 +3462,15 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return swap_info[swp_type(entry)]; +} + struct swap_info_struct *page_swap_info(struct page *page) { - swp_entry_t swap = { .val = page_private(page) }; - return swap_info[swp_type(swap)]; + swp_entry_t entry = { .val = page_private(page) }; + return swp_swap_info(entry); } /* @@ -3463,7 +3478,6 @@ struct swap_info_struct *page_swap_info(struct page *page) */ struct address_space *__page_file_mapping(struct page *page) { - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return page_swap_info(page)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(__page_file_mapping); @@ -3471,7 +3485,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); pgoff_t __page_file_index(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); diff --git a/mm/truncate.c b/mm/truncate.c index 2330223841fb..e4b4cf0f4070 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -25,44 +25,85 @@ #include <linux/rmap.h> #include "internal.h" -static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, - void *entry) +/* + * Regular page slots are stabilized by the page lock even without the tree + * itself locked. These unlocked entries need verification under the tree + * lock. + */ +static inline void __clear_shadow_entry(struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_node *node; void **slot; - spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; + return; if (*slot != entry) - goto unlock; + return; __radix_tree_replace(&mapping->page_tree, node, slot, NULL, - workingset_update_node, mapping); + workingset_update_node); mapping->nrexceptional--; -unlock: +} + +static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, + void *entry) +{ + spin_lock_irq(&mapping->tree_lock); + __clear_shadow_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); } /* - * Unconditionally remove exceptional entry. Usually called from truncate path. + * Unconditionally remove exceptional entries. Usually called from truncate + * path. Note that the pagevec may be altered by this function by removing + * exceptional entries similar to what pagevec_remove_exceptionals does. */ -static void truncate_exceptional_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void truncate_exceptional_pvec_entries(struct address_space *mapping, + struct pagevec *pvec, pgoff_t *indices, + pgoff_t end) { + int i, j; + bool dax, lock; + /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; - if (dax_mapping(mapping)) { - dax_delete_mapping_entry(mapping, index); + for (j = 0; j < pagevec_count(pvec); j++) + if (radix_tree_exceptional_entry(pvec->pages[j])) + break; + + if (j == pagevec_count(pvec)) return; + + dax = dax_mapping(mapping); + lock = !dax && indices[j] < end; + if (lock) + spin_lock_irq(&mapping->tree_lock); + + for (i = j; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + pgoff_t index = indices[i]; + + if (!radix_tree_exceptional_entry(page)) { + pvec->pages[j++] = page; + continue; + } + + if (index >= end) + continue; + + if (unlikely(dax)) { + dax_delete_mapping_entry(mapping, index); + continue; + } + + __clear_shadow_entry(mapping, index, page); } - clear_shadow_entry(mapping, index, entry); + + if (lock) + spin_unlock_irq(&mapping->tree_lock); + pvec->nr = j; } /* @@ -134,11 +175,17 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static int -truncate_complete_page(struct address_space *mapping, struct page *page) +static void +truncate_cleanup_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) - return -EIO; + if (page_mapped(page)) { + loff_t holelen; + + holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_SHIFT, + holelen, 0); + } if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_SIZE); @@ -150,8 +197,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) */ cancel_dirty_page(page); ClearPageMappedToDisk(page); - delete_from_page_cache(page); - return 0; } /* @@ -180,16 +225,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) int truncate_inode_page(struct address_space *mapping, struct page *page) { - loff_t holelen; VM_BUG_ON_PAGE(PageTail(page), page); - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); - } - return truncate_complete_page(mapping, page); + if (page->mapping != mapping) + return -EIO; + + truncate_cleanup_page(mapping, page); + delete_from_page_cache(page); + return 0; } /* @@ -287,11 +330,19 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* + * Pagevec array has exceptional entries and we may also fail + * to lock some pages. So we store pages that can be deleted + * in a new pagevec. + */ + struct pagevec locked_pvec; + + pagevec_init(&locked_pvec); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -300,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } if (!trylock_page(page)) continue; @@ -313,15 +361,22 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - truncate_inode_page(mapping, page); - unlock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + pagevec_add(&locked_pvec, page); } - pagevec_remove_exceptionals(&pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + truncate_cleanup_page(mapping, locked_pvec.pages[i]); + delete_from_page_cache_batch(mapping, &locked_pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + unlock_page(locked_pvec.pages[i]); + truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); cond_resched(); index++; } - if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { @@ -379,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); break; } + for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -390,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } lock_page(page); WARN_ON(page_to_index(page) != index); @@ -402,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } - pagevec_remove_exceptionals(&pvec); + truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); index++; } @@ -500,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { @@ -630,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (mapping->nrpages == 0 && mapping->nrexceptional == 0) goto out; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, diff --git a/mm/vmscan.c b/mm/vmscan.c index 15b483ef6440..c02c850ea349 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1349,7 +1349,7 @@ keep: mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); - free_hot_cold_page_list(&free_pages, true); + free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1824,7 +1824,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); - free_hot_cold_page_list(&page_list, true); + free_unref_page_list(&page_list); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -2063,7 +2063,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); - free_hot_cold_page_list(&l_hold, true); + free_unref_page_list(&l_hold); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2082,7 +2082,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages - * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. * * total target max diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e72ac97..40b2db6db6b1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -32,6 +32,77 @@ #define NUMA_STATS_THRESHOLD (U16_MAX - 2) +#ifdef CONFIG_NUMA +int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; + +/* zero numa counters within a zone */ +static void zero_zone_numa_counters(struct zone *zone) +{ + int item, cpu; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_stat[item], 0); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] + = 0; + } +} + +/* zero numa counters of all the populated zones */ +static void zero_zones_numa_counters(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zero_zone_numa_counters(zone); +} + +/* zero global numa counters */ +static void zero_global_numa_counters(void) +{ + int item; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) + atomic_long_set(&vm_numa_stat[item], 0); +} + +static void invalid_numa_statistics(void) +{ + zero_zones_numa_counters(); + zero_global_numa_counters(); +} + +static DEFINE_MUTEX(vm_numa_stat_lock); + +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret, oldval; + + mutex_lock(&vm_numa_stat_lock); + if (write) + oldval = sysctl_vm_numa_stat; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + goto out; + + if (oldval == sysctl_vm_numa_stat) + goto out; + else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { + static_branch_enable(&vm_numa_stat_key); + pr_info("enable numa statistics\n"); + } else { + static_branch_disable(&vm_numa_stat_key); + invalid_numa_statistics(); + pr_info("disable numa statistics, and clear numa counters\n"); + } + +out: + mutex_unlock(&vm_numa_stat_lock); + return ret; +} +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -1564,11 +1635,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n node_unreclaimable: %u" - "\n start_pfn: %lu" - "\n node_inactive_ratio: %u", + "\n start_pfn: %lu", pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, - zone->zone_start_pfn, - zone->zone_pgdat->inactive_ratio); + zone->zone_start_pfn); seq_putc(m, '\n'); } diff --git a/mm/workingset.c b/mm/workingset.c index b997c9de28f6..b7d616a3bbbe 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -340,14 +340,8 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node, void *private) +void workingset_update_node(struct radix_tree_node *node) { - struct address_space *mapping = private; - - /* Only regular page cache has shadow entries */ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. @@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); out_invalid: spin_unlock(&mapping->tree_lock); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7c38e850a8fc..685049a9048d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1349,7 +1349,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. */ - WARN_ON_ONCE(in_interrupt()); + BUG_ON(in_interrupt()); /* From now on, migration cannot move the object */ pin_tag(handle); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8134c00df6c2..6b0ff396fa9d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -41,7 +41,6 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> @@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); @@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); return skb; } @@ -357,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __netdev_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(netdev_alloc_frag); @@ -370,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) void *napi_alloc_frag(unsigned int fragsz) { - return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(napi_alloc_frag); @@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (!n) return NULL; - kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; } diff --git a/net/core/sock.c b/net/core/sock.c index 13719af7b4e3..c0b5b2f17412 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { - kmemcheck_annotate_bitfield(sk, flags); - if (security_sk_alloc(sk, family, priority)) goto out_free; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a4bab81f1462..c690cd0d9b3f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,7 +9,6 @@ */ #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> @@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, if (tw) { const struct inet_sock *inet = inet_sk(sk); - kmemcheck_annotate_bitfield(tw, flags); - tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dabbf1d392fb..f844c06c0676 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6130,7 +6130,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, if (req) { struct inet_request_sock *ireq = inet_rsk(req); - kmemcheck_annotate_bitfield(ireq, flags); ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 86ef907067bb..e0f70c4051b6 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -139,8 +139,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, return -EINVAL; } - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); + dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); if (!dma_pages) { ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); return -ENOMEM; diff --git a/net/socket.c b/net/socket.c index c729625eb5d3..42d8e9c9ccd5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -568,7 +568,6 @@ struct socket *sock_alloc(void) sock = SOCKET_I(inode); - kmemcheck_annotate_bitfield(sock, type); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index a27677146410..6f099f915dcf 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -12,18 +12,22 @@ from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) -if len(sys.argv) != 3: - sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0]) +if len(sys.argv) < 3: + sys.stderr.write("usage: %s [option] file1 file2\n" % sys.argv[0]) + sys.stderr.write("The options are:\n") + sys.stderr.write("-c cateogrize output based on symbole type\n") + sys.stderr.write("-d Show delta of Data Section\n") + sys.stderr.write("-t Show delta of text Section\n") sys.exit(-1) re_NUMBER = re.compile(r'\.[0-9]+') -def getsizes(file): +def getsizes(file, format): sym = {} with os.popen("nm --size-sort " + file) as f: for line in f: size, type, name = line.split() - if type in "tTdDbBrR": + if type in format: # strip generated symbols if name.startswith("__mod_"): continue if name.startswith("SyS_"): continue @@ -34,44 +38,61 @@ def getsizes(file): sym[name] = sym.get(name, 0) + int(size, 16) return sym -old = getsizes(sys.argv[1]) -new = getsizes(sys.argv[2]) -grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0 -delta, common = [], {} -otot, ntot = 0, 0 +def calc(oldfile, newfile, format): + old = getsizes(oldfile, format) + new = getsizes(newfile, format) + grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0 + delta, common = [], {} + otot, ntot = 0, 0 -for a in old: - if a in new: - common[a] = 1 + for a in old: + if a in new: + common[a] = 1 -for name in old: - otot += old[name] - if name not in common: - remove += 1 - down += old[name] - delta.append((-old[name], name)) + for name in old: + otot += old[name] + if name not in common: + remove += 1 + down += old[name] + delta.append((-old[name], name)) -for name in new: - ntot += new[name] - if name not in common: - add += 1 - up += new[name] - delta.append((new[name], name)) + for name in new: + ntot += new[name] + if name not in common: + add += 1 + up += new[name] + delta.append((new[name], name)) -for name in common: + for name in common: d = new.get(name, 0) - old.get(name, 0) if d>0: grow, up = grow+1, up+d if d<0: shrink, down = shrink+1, down-d delta.append((d, name)) -delta.sort() -delta.reverse() + delta.sort() + delta.reverse() + return grow, shrink, add, remove, up, down, delta, old, new, otot, ntot -print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ - (add, remove, grow, shrink, up, -down, up-down)) -print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta")) -for d, n in delta: - if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) +def print_result(symboltype, symbolformat, argc): + grow, shrink, add, remove, up, down, delta, old, new, otot, ntot = \ + calc(sys.argv[argc - 1], sys.argv[argc], symbolformat) -print("Total: Before=%d, After=%d, chg %+.2f%%" % \ - (otot, ntot, (ntot - otot)*100.0/otot)) + print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ + (add, remove, grow, shrink, up, -down, up-down)) + print("%-40s %7s %7s %+7s" % (symboltype, "old", "new", "delta")) + for d, n in delta: + if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) + + print("Total: Before=%d, After=%d, chg %+.2f%%" % \ + (otot, ntot, (ntot - otot)*100.0/otot)) + +if sys.argv[1] == "-c": + print_result("Function", "tT", 3) + print_result("Data", "dDbB", 3) + print_result("RO Data", "rR", 3) +elif sys.argv[1] == "-d": + print_result("Data", "dDbBrR", 3) +elif sys.argv[1] == "-t": + print_result("Function", "tT", 3) +else: + print_result("Function", "tTdDbBrR", 2) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 67d051edd615..7bd52b8f63d4 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -2182,8 +2182,6 @@ sub dump_struct($$) { # strip comments: $members =~ s/\/\*.*?\*\///gos; $nested =~ s/\/\*.*?\*\///gos; - # strip kmemcheck_bitfield_{begin,end}.*; - $members =~ s/kmemcheck_bitfield_.*?;//gos; # strip attributes $members =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i; $members =~ s/__aligned\s*\([^;]*\)//gos; diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h index 2bccd2c7b897..ea32a7d3cf1b 100644 --- a/tools/include/linux/kmemcheck.h +++ b/tools/include/linux/kmemcheck.h @@ -1,9 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_ -#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_ - -static inline void kmemcheck_mark_initialized(void *address, unsigned int n) -{ -} - -#endif diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 557d391f564a..ae11e4c3516a 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -641,7 +641,6 @@ static const struct { { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, - { "__GFP_COLD", "CO" }, { "__GFP_NOWARN", "NWR" }, { "__GFP_RETRY_MAYFAIL", "R" }, { "__GFP_NOFAIL", "NF" }, @@ -655,7 +654,6 @@ static const struct { { "__GFP_RECLAIMABLE", "RC" }, { "__GFP_MOVABLE", "M" }, { "__GFP_ACCOUNT", "AC" }, - { "__GFP_NOTRACK", "NT" }, { "__GFP_WRITE", "WR" }, { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 06c71178d07d..59245b3d587c 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -618,7 +618,7 @@ static void multiorder_account(void) __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); __radix_tree_lookup(&tree, 1 << 5, &node, &slot); assert(node->count == node->exceptional * 2); - __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL); + __radix_tree_replace(&tree, node, slot, NULL, NULL); assert(node->exceptional == 0); item_kill_tree(&tree); diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index b0b7ef6d0de1..f82c2eaa859d 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -84,6 +84,7 @@ int output_lines = -1; int sort_loss; int extended_totals; int show_bytes; +int unreclaim_only; /* Debug options */ int sanity; @@ -133,6 +134,7 @@ static void usage(void) "-L|--Loss Sort by loss\n" "-X|--Xtotals Show extended summary information\n" "-B|--Bytes Show size in bytes\n" + "-U|--Unreclaim Show unreclaimable slabs only\n" "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" @@ -569,6 +571,9 @@ static void slabcache(struct slabinfo *s) if (strcmp(s->name, "*") == 0) return; + if (unreclaim_only && s->reclaim_account) + return; + if (actual_slabs == 1) { report(s); return; @@ -1347,6 +1352,7 @@ struct option opts[] = { { "Loss", no_argument, NULL, 'L'}, { "Xtotals", no_argument, NULL, 'X'}, { "Bytes", no_argument, NULL, 'B'}, + { "Unreclaim", no_argument, NULL, 'U'}, { NULL, 0, NULL, 0 } }; @@ -1358,7 +1364,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB", + while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXBU", opts, NULL)) != -1) switch (c) { case '1': @@ -1439,6 +1445,9 @@ int main(int argc, char *argv[]) case 'B': show_bytes = 1; break; + case 'U': + unreclaim_only = 1; + break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); |