diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-15 06:08:51 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-15 06:08:51 +0300 |
commit | 1064d857738187c764c0bd76040f424397f857c7 (patch) | |
tree | 13d16c0aed50b64c20b8fe235b15172f3c997f15 | |
parent | 35c99ffa20edd3c24be352d28a63cd3a23121282 (diff) | |
parent | def0fdae813dbbbbb588bfc5f52856be2e842b35 (diff) | |
download | linux-1064d857738187c764c0bd76040f424397f857c7.tar.xz |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
- a couple of hotfixes
- almost all of the rest of MM
- lib/ updates
- binfmt_elf updates
- autofs updates
- quite a lot of misc fixes and updates
- reiserfs, fatfs
- signals
- exec
- cpumask
- rapidio
- sysctl
- pids
- eventfd
- gcov
- panic
- pps
- gdb script updates
- ipc updates
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (126 commits)
mm: memcontrol: fix NUMA round-robin reclaim at intermediate level
mm: memcontrol: fix recursive statistics correctness & scalabilty
mm: memcontrol: move stat/event counting functions out-of-line
mm: memcontrol: make cgroup stats and events query API explicitly local
drivers/virt/fsl_hypervisor.c: prevent integer overflow in ioctl
drivers/virt/fsl_hypervisor.c: dereferencing error pointers in ioctl
mm, memcg: rename ambiguously named memory.stat counters and functions
arch: remove <asm/sizes.h> and <asm-generic/sizes.h>
treewide: replace #include <asm/sizes.h> with #include <linux/sizes.h>
fs/block_dev.c: Remove duplicate header
fs/cachefiles/namei.c: remove duplicate header
include/linux/sched/signal.h: replace `tsk' with `task'
fs/coda/psdev.c: remove duplicate header
ipc: do cyclic id allocation for the ipc object.
ipc: conserve sequence numbers in ipcmni_extend mode
ipc: allow boot time extension of IPCMNI from 32k to 16M
ipc/mqueue: optimize msg_get()
ipc/mqueue: remove redundant wq task assignment
ipc: prevent lockup on alloc_msg and free_msg
scripts/gdb: print cached rate in lx-clk-summary
...
218 files changed, 4500 insertions, 1178 deletions
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index 7e71c9c1d8e9..5cbe5659e3b7 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time spikes which wouldn't necessarily make a dent in the time averages, or to average trends over custom time frames. +Monitoring for pressure thresholds +================================== + +Users can register triggers and use poll() to be woken up when resource +pressure exceeds certain thresholds. + +A trigger describes the maximum cumulative stall time over a specific +time window, e.g. 100ms of total stall time within any 500ms window to +generate a wakeup event. + +To register a trigger user has to open psi interface file under +/proc/pressure/ representing the resource to be monitored and write the +desired threshold and time window. The open file descriptor should be +used to wait for trigger events using select(), poll() or epoll(). +The following format is used: + +<some|full> <stall amount in us> <time window in us> + +For example writing "some 150000 1000000" into /proc/pressure/memory +would add 150ms threshold for partial memory stall measured within +1sec time window. Writing "full 50000 1000000" into /proc/pressure/io +would add 50ms threshold for full io stall measured within 1sec time window. + +Triggers can be set on more than one psi metric and more than one trigger +for the same psi metric can be specified. However for each trigger a separate +file descriptor is required to be able to poll it separately from others, +therefore for each trigger a separate open() syscall should be made even +when opening the same psi interface file. + +Monitors activate only when system enters stall state for the monitored +psi metric and deactivates upon exit from the stall state. While system is +in the stall state psi signal growth is monitored at a rate of 10 times per +tracking window. + +The kernel accepts window sizes ranging from 500ms to 10s, therefore min +monitoring update interval is 50ms and max is 1s. Min limit is set to +prevent overly frequent polling. Max limit is chosen as a high enough number +after which monitors are most likely not needed and psi averages can be used +instead. + +When activated, psi monitor stays active for at least the duration of one +tracking window to avoid repeated activations/deactivations when system is +bouncing in and out of the stall state. + +Notifications to the userspace are rate-limited to one per tracking window. + +The trigger will de-register when the file descriptor used to define the +trigger is closed. + +Userspace monitor usage example +=============================== + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <poll.h> +#include <string.h> +#include <unistd.h> + +/* + * Monitor memory partial stall with 1s tracking window size + * and 150ms threshold. + */ +int main() { + const char trig[] = "some 150000 1000000"; + struct pollfd fds; + int n; + + fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK); + if (fds.fd < 0) { + printf("/proc/pressure/memory open error: %s\n", + strerror(errno)); + return 1; + } + fds.events = POLLPRI; + + if (write(fds.fd, trig, strlen(trig) + 1) < 0) { + printf("/proc/pressure/memory write error: %s\n", + strerror(errno)); + return 1; + } + + printf("waiting for events...\n"); + while (1) { + n = poll(&fds, 1, -1); + if (n < 0) { + printf("poll error: %s\n", strerror(errno)); + return 1; + } + if (fds.revents & POLLERR) { + printf("got POLLERR, event source is gone\n"); + return 0; + } + if (fds.revents & POLLPRI) { + printf("event triggered!\n"); + } else { + printf("unknown event received: 0x%x\n", fds.revents); + return 1; + } + } + + return 0; +} + Cgroup2 interface ================= @@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped into cgroups. Each subdirectory in the cgroupfs mountpoint contains cpu.pressure, memory.pressure, and io.pressure files; the format is the same as the /proc/pressure/ files. + +Per-cgroup psi monitors can be specified and used the same way as +system-wide ones. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 43176340c73d..d1d1da911085 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1830,6 +1830,9 @@ ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt. + ipcmni_extend [KNL] Extend the maximum number of unique System V + IPC identifiers from 32,768 to 16,777,216. + irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. @@ -3174,6 +3177,16 @@ This will also cause panics on machine check exceptions. Useful together with panic=30 to trigger a reboot. + page_alloc.shuffle= + [KNL] Boolean flag to control whether the page allocator + should randomize its free lists. The randomization may + be automatically enabled if the kernel detects it is + running on a platform with a direct-mapped memory-side + cache, and this parameter can be used to + override/disable that behavior. The state of the flag + can be read from sysfs at: + /sys/module/page_alloc/parameters/shuffle. + page_owner= [KNL] Boot-time page_owner enabling option. Storage of the information about who allocated each page is disabled in default. With this switch, @@ -4054,7 +4067,9 @@ [[,]s[mp]#### \ [[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \ [[,]f[orce] - Where reboot_mode is one of warm (soft) or cold (hard) or gpio, + Where reboot_mode is one of warm (soft) or cold (hard) or gpio + (prefix with 'panic_' to set mode for panic + reboot only), reboot_type is one of bios, acpi, kbd, triple, efi, or pci, reboot_force is either force or not specified, reboot_cpu is s[mp]#### with #### being the processor diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 71f5d2fe39b7..a29c99d13331 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -147,10 +147,10 @@ Division Functions .. kernel-doc:: include/linux/math64.h :internal: -.. kernel-doc:: lib/div64.c +.. kernel-doc:: lib/math/div64.c :functions: div_s64_rem div64_u64_rem div64_u64 div64_s64 -.. kernel-doc:: lib/gcd.c +.. kernel-doc:: lib/math/gcd.c :export: UUID/GUID diff --git a/Documentation/dev-tools/gcov.rst b/Documentation/dev-tools/gcov.rst index 69a7d90c320a..46aae52a41d0 100644 --- a/Documentation/dev-tools/gcov.rst +++ b/Documentation/dev-tools/gcov.rst @@ -34,10 +34,6 @@ Configure the kernel with:: CONFIG_DEBUG_FS=y CONFIG_GCOV_KERNEL=y -select the gcc's gcov format, default is autodetect based on gcc version:: - - CONFIG_GCOV_FORMAT_AUTODETECT=y - and to get coverage data for the entire kernel:: CONFIG_GCOV_PROFILE_ALL=y @@ -169,6 +165,20 @@ b) gcov is run on the BUILD machine [user@build] gcov -o /tmp/coverage/tmp/out/init main.c +Note on compilers +----------------- + +GCC and LLVM gcov tools are not necessarily compatible. Use gcov_ to work with +GCC-generated .gcno and .gcda files, and use llvm-cov_ for Clang. + +.. _gcov: http://gcc.gnu.org/onlinedocs/gcc/Gcov.html +.. _llvm-cov: https://llvm.org/docs/CommandGuide/llvm-cov.html + +Build differences between GCC and Clang gcov are handled by Kconfig. It +automatically selects the appropriate gcov format depending on the detected +toolchain. + + Troubleshooting --------------- diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt index 3683874832ae..9012a2a02e14 100644 --- a/Documentation/devicetree/bindings/pps/pps-gpio.txt +++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt @@ -7,6 +7,10 @@ Required properties: - compatible: should be "pps-gpio" - gpios: one PPS GPIO in the format described by ../gpio/gpio.txt +Additional required properties for the PPS ECHO functionality: +- echo-gpios: one PPS ECHO GPIO in the format described by ../gpio/gpio.txt +- echo-active-ms: duration in ms of the active portion of the echo pulse + Optional properties: - assert-falling-edge: when present, assert is indicated by a falling edge (instead of by a rising edge) @@ -19,5 +23,8 @@ Example: gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>; assert-falling-edge; + echo-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; + echo-active-ms = <100>; + compatible = "pps-gpio"; }; diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt index 45edad6933cc..acc02fc57993 100644 --- a/Documentation/filesystems/autofs-mount-control.txt +++ b/Documentation/filesystems/autofs-mount-control.txt @@ -354,8 +354,10 @@ this ioctl is called until no further expire candidates are found. The call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained from the open call. In -addition an immediate expire, independent of the mount timeout, can be -requested by setting the how field of struct args_expire to 1. If no +addition an immediate expire that's independent of the mount timeout, +and a forced expire that's independent of whether the mount is busy, +can be requested by setting the how field of struct args_expire to +AUTOFS_EXP_IMMEDIATE or AUTOFS_EXP_FORCED, respectively . If no expire candidates can be found the ioctl returns -1 with errno set to EAGAIN. diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt index 373ad25852d3..3af38c7fd26d 100644 --- a/Documentation/filesystems/autofs.txt +++ b/Documentation/filesystems/autofs.txt @@ -116,7 +116,7 @@ that purpose there is another flag. **DCACHE_MANAGE_TRANSIT** If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but -related behaviors are invoked, both using the `d_op->d_manage()` +related behaviours are invoked, both using the `d_op->d_manage()` dentry operation. Firstly, before checking to see if any filesystem is mounted on the @@ -193,8 +193,8 @@ VFS remain in RCU-walk mode, but can only tell it to get out of RCU-walk mode by returning `-ECHILD`. So `d_manage()`, when called with `rcu_walk` set, should either return --ECHILD if there is any reason to believe it is unsafe to end the -mounted filesystem, and otherwise should return 0. +-ECHILD if there is any reason to believe it is unsafe to enter the +mounted filesystem, otherwise it should return 0. autofs will return `-ECHILD` if an expiry of the filesystem has been initiated or is being considered, otherwise it returns 0. @@ -210,7 +210,7 @@ mounts that were created by `d_automount()` returning a filesystem to be mounted. As autofs doesn't return such a filesystem but leaves the mounting to the automount daemon, it must involve the automount daemon in unmounting as well. This also means that autofs has more control -of expiry. +over expiry. The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to the `umount` system call. Unmounting with MNT_EXPIRE will fail unless @@ -225,7 +225,7 @@ unmount any filesystems mounted on the autofs filesystem or remove any symbolic links or empty directories any time it likes. If the unmount or removal is successful the filesystem will be returned to the state it was before the mount or creation, so that any access of the name -will trigger normal auto-mount processing. In particlar, `rmdir` and +will trigger normal auto-mount processing. In particular, `rmdir` and `unlink` do not leave negative entries in the dcache as a normal filesystem would, so an attempt to access a recently-removed object is passed to autofs for handling. @@ -240,11 +240,18 @@ Normally the daemon only wants to remove entries which haven't been used for a while. For this purpose autofs maintains a "`last_used`" time stamp on each directory or symlink. For symlinks it genuinely does record the last time the symlink was "used" or followed to find -out where it points to. For directories the field is a slight -misnomer. It actually records the last time that autofs checked if -the directory or one of its descendents was busy and found that it -was. This is just as useful and doesn't require updating the field so -often. +out where it points to. For directories the field is used slightly +differently. The field is updated at mount time and during expire +checks if it is found to be in use (ie. open file descriptor or +process working directory) and during path walks. The update done +during path walks prevents frequent expire and immediate mount of +frequently accessed automounts. But in the case where a GUI continually +access or an application frequently scans an autofs directory tree +there can be an accumulation of mounts that aren't actually being +used. To cater for this case the "`strictexpire`" autofs mount option +can be used to avoid the "`last_used`" update on path walk thereby +preventing this apparent inability to expire mounts that aren't +really in use. The daemon is able to ask autofs if anything is due to be expired, using an `ioctl` as discussed later. For a *direct* mount, autofs @@ -255,8 +262,12 @@ up. There is an option with indirect mounts to consider each of the leaves that has been mounted on instead of considering the top-level names. -This is intended for compatability with version 4 of autofs and should -be considered as deprecated. +This was originally intended for compatibility with version 4 of autofs +and should be considered as deprecated for Sun Format automount maps. +However, it may be used again for amd format mount maps (which are +generally indirect maps) because the amd automounter allows for the +setting of an expire timeout for individual mounts. But there are +some difficulties in making the needed changes for this. When autofs considers a directory it checks the `last_used` time and compares it with the "timeout" value set when the filesystem was @@ -273,7 +284,7 @@ mounts. If it finds something in the root directory to expire it will return the name of that thing. Once a name has been returned the automount daemon needs to unmount any filesystems mounted below the name normally. As described above, this is unsafe for non-toplevel -mounts in a version-5 autofs. For this reason the current `automountd` +mounts in a version-5 autofs. For this reason the current `automount(8)` does not use this ioctl. The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or @@ -345,7 +356,7 @@ The `wait_queue_token` is a unique number which can identify a particular request to be acknowledged. When a message is sent over the pipe the affected dentry is marked as either "active" or "expiring" and other accesses to it block until the message is -acknowledged using one of the ioctls below and the relevant +acknowledged using one of the ioctls below with the relevant `wait_queue_token`. Communicating with autofs: root directory ioctls @@ -367,15 +378,14 @@ The available ioctl commands are: This mode is also entered if a write to the pipe fails. - **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use. - **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which - is really a version number for the implementation. It is - currently 2. + is really a version number for the implementation. - **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned long. The value is used to set the timeout for expiry, and the current timeout value is stored back through the pointer. - **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if the filesystem could be unmounted. This is only a hint as the situation could change at any instant. This call can be - use to avoid a more expensive full unmount attempt. + used to avoid a more expensive full unmount attempt. - **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is anything suitable to expire. A pointer to a packet: @@ -400,6 +410,11 @@ The available ioctl commands are: **AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored and objects are expired if the are not in use. + **AUTOFS_EXP_FORCED** causes the in use status to be ignored + and objects are expired ieven if they are in use. This assumes + that the daemon has requested this because it is capable of + performing the umount. + **AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level name to expire. This is only safe when *maxproto* is 4. @@ -415,7 +430,7 @@ which can be used to communicate directly with the autofs filesystem. It requires CAP_SYS_ADMIN for access. The `ioctl`s that can be used on this device are described in a separate -document `autofs-mount-control.txt`, and are summarized briefly here. +document `autofs-mount-control.txt`, and are summarised briefly here. Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: struct autofs_dev_ioctl { @@ -511,6 +526,21 @@ directories. Catatonic mode can only be left via the **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`. +The "ignore" mount option +------------------------- + +The "ignore" mount option can be used to provide a generic indicator +to applications that the mount entry should be ignored when displaying +mount information. + +In other OSes that provide autofs and that provide a mount list to user +space based on the kernel mount list a no-op mount option ("ignore" is +the one use on the most common OSes) is allowed so that autofs file +system users can optionally use it. + +This is intended to be used by user space programs to exclude autofs +mounts from consideration when reading the mounts list. + autofs, name spaces, and shared mounts -------------------------------------- diff --git a/arch/arm/common/sa1111.c b/arch/arm/common/sa1111.c index 45412d21aa6b..179ca8757a74 100644 --- a/arch/arm/common/sa1111.c +++ b/arch/arm/common/sa1111.c @@ -32,7 +32,7 @@ #include <mach/hardware.h> #include <asm/mach/irq.h> #include <asm/mach-types.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/hardware/sa1111.h> diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 41deac2451af..0b2ecc98e086 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -17,7 +17,6 @@ generic-y += seccomp.h generic-y += segment.h generic-y += serial.h generic-y += simd.h -generic-y += sizes.h generic-y += trace_clock.h generated-y += mach-types.h diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index cba23eaa6072..7a88f160b1fb 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h @@ -6,6 +6,7 @@ #include <linux/threads.h> #include <asm/irq.h> +/* number of IPIS _not_ including IPI_CPU_BACKTRACE */ #define NR_IPI 7 typedef struct { diff --git a/arch/arm/kernel/atags.h b/arch/arm/kernel/atags.h index 201100226301..067e12edc341 100644 --- a/arch/arm/kernel/atags.h +++ b/arch/arm/kernel/atags.h @@ -5,7 +5,7 @@ void convert_to_tag_list(struct tag *tags); const struct machine_desc *setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr); #else -static inline const struct machine_desc * +static inline const struct machine_desc * __init __noreturn setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr) { early_print("no ATAGS support: can't continue\n"); diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index facd4240ca02..c93fe0f256de 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -70,6 +70,10 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_IRQ_WORK, IPI_COMPLETION, + /* + * CPU_BACKTRACE is special and not included in NR_IPI + * or tracable with trace_ipi_* + */ IPI_CPU_BACKTRACE, /* * SGI8-15 can be reserved by secure firmware, and thus may @@ -797,7 +801,7 @@ core_initcall(register_cpufreq_notifier); static void raise_nmi(cpumask_t *mask) { - smp_cross_call(mask, IPI_CPU_BACKTRACE); + __smp_cross_call(mask, IPI_CPU_BACKTRACE); } void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) diff --git a/arch/arm/mach-imx/devices/platform-fec.c b/arch/arm/mach-imx/devices/platform-fec.c index b403a4fe2892..605c0af5851d 100644 --- a/arch/arm/mach-imx/devices/platform-fec.c +++ b/arch/arm/mach-imx/devices/platform-fec.c @@ -7,7 +7,7 @@ * Free Software Foundation. */ #include <linux/dma-mapping.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/devices/platform-gpio_keys.c b/arch/arm/mach-imx/devices/platform-gpio_keys.c index 486282539c76..9f0a132ea1bc 100644 --- a/arch/arm/mach-imx/devices/platform-gpio_keys.c +++ b/arch/arm/mach-imx/devices/platform-gpio_keys.c @@ -15,7 +15,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. */ -#include <asm/sizes.h> +#include <linux/sizes.h> #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/devices/platform-imx2-wdt.c b/arch/arm/mach-imx/devices/platform-imx2-wdt.c index 8c134c8d7500..0c6d3c05fd6d 100644 --- a/arch/arm/mach-imx/devices/platform-imx2-wdt.c +++ b/arch/arm/mach-imx/devices/platform-imx2-wdt.c @@ -6,7 +6,7 @@ * the terms of the GNU General Public License version 2 as published by the * Free Software Foundation. */ -#include <asm/sizes.h> +#include <linux/sizes.h> #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/devices/platform-mxc_nand.c b/arch/arm/mach-imx/devices/platform-mxc_nand.c index 676df4920c7b..046e0cc826c1 100644 --- a/arch/arm/mach-imx/devices/platform-mxc_nand.c +++ b/arch/arm/mach-imx/devices/platform-mxc_nand.c @@ -6,7 +6,7 @@ * the terms of the GNU General Public License version 2 as published by the * Free Software Foundation. */ -#include <asm/sizes.h> +#include <linux/sizes.h> #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/hardware.h b/arch/arm/mach-imx/hardware.h index 90e10cbd8fd1..b5ca8cebe1d6 100644 --- a/arch/arm/mach-imx/hardware.h +++ b/arch/arm/mach-imx/hardware.h @@ -24,7 +24,7 @@ #include <asm/io.h> #include <soc/imx/revision.h> #endif -#include <asm/sizes.h> +#include <linux/sizes.h> #define addr_in_module(addr, mod) \ ((unsigned long)(addr) - mod ## _BASE_ADDR < mod ## _SIZE) diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c index 8dfad012dfae..6ddbe153910a 100644 --- a/arch/arm/mach-integrator/impd1.c +++ b/arch/arm/mach-integrator/impd1.c @@ -27,7 +27,7 @@ #include <linux/irqchip/arm-vic.h> #include <linux/gpio/machine.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include "lm.h" #include "impd1.h" diff --git a/arch/arm/mach-iop13xx/pci.c b/arch/arm/mach-iop13xx/pci.c index 070d92ae1b6f..8426ab9e2f5a 100644 --- a/arch/arm/mach-iop13xx/pci.c +++ b/arch/arm/mach-iop13xx/pci.c @@ -24,7 +24,7 @@ #include <linux/export.h> #include <asm/irq.h> #include <mach/hardware.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/signal.h> #include <asm/mach/pci.h> #include "pci.h" diff --git a/arch/arm/mach-iop13xx/tpmi.c b/arch/arm/mach-iop13xx/tpmi.c index 116feb6b261e..d3d8c78e7d10 100644 --- a/arch/arm/mach-iop13xx/tpmi.c +++ b/arch/arm/mach-iop13xx/tpmi.c @@ -23,7 +23,7 @@ #include <linux/dma-mapping.h> #include <linux/io.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/irqs.h> /* assumes CONTROLLER_ONLY# is never asserted in the ESSR register */ diff --git a/arch/arm/mach-ixp4xx/common-pci.c b/arch/arm/mach-ixp4xx/common-pci.c index 6835b17113e5..a53104bb28f5 100644 --- a/arch/arm/mach-ixp4xx/common-pci.c +++ b/arch/arm/mach-ixp4xx/common-pci.c @@ -31,7 +31,7 @@ #include <asm/cputype.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/pci.h> #include <mach/hardware.h> diff --git a/arch/arm/mach-ks8695/include/mach/hardware.h b/arch/arm/mach-ks8695/include/mach/hardware.h index 959c748ee8bb..877629b3d944 100644 --- a/arch/arm/mach-ks8695/include/mach/hardware.h +++ b/arch/arm/mach-ks8695/include/mach/hardware.h @@ -14,7 +14,7 @@ #ifndef __ASM_ARCH_HARDWARE_H #define __ASM_ARCH_HARDWARE_H -#include <asm/sizes.h> +#include <linux/sizes.h> /* * Clocks are derived from MCLK, which is 25MHz diff --git a/arch/arm/mach-omap1/include/mach/hardware.h b/arch/arm/mach-omap1/include/mach/hardware.h index 5875a5098d35..e7c8ac7d83e3 100644 --- a/arch/arm/mach-omap1/include/mach/hardware.h +++ b/arch/arm/mach-omap1/include/mach/hardware.h @@ -36,7 +36,7 @@ #ifndef __ASM_ARCH_OMAP_HARDWARE_H #define __ASM_ARCH_OMAP_HARDWARE_H -#include <asm/sizes.h> +#include <linux/sizes.h> #ifndef __ASSEMBLER__ #include <asm/types.h> #include <mach/soc.h> diff --git a/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c b/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c index 9b30b6b471ae..e19f620c4074 100644 --- a/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c @@ -11,7 +11,7 @@ * XXX handle crossbar/shared link difference for L3? * XXX these should be marked initdata for multi-OMAP kernels */ -#include <asm/sizes.h> +#include <linux/sizes.h> #include "omap_hwmod.h" #include "l3_2xxx.h" diff --git a/arch/arm/mach-prima2/common.c b/arch/arm/mach-prima2/common.c index ffe05c27087e..1607deab5290 100644 --- a/arch/arm/mach-prima2/common.c +++ b/arch/arm/mach-prima2/common.c @@ -8,7 +8,7 @@ #include <linux/init.h> #include <linux/kernel.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach-types.h> #include <asm/mach/arch.h> #include <linux/of.h> diff --git a/arch/arm/mach-pxa/balloon3.c b/arch/arm/mach-pxa/balloon3.c index 4bcbd3d55b36..1f24e0259f99 100644 --- a/arch/arm/mach-pxa/balloon3.c +++ b/arch/arm/mach-pxa/balloon3.c @@ -35,7 +35,7 @@ #include <asm/setup.h> #include <asm/mach-types.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-pxa/colibri-pxa270.c b/arch/arm/mach-pxa/colibri-pxa270.c index e68acdd0cdbb..510625dde3cb 100644 --- a/arch/arm/mach-pxa/colibri-pxa270.c +++ b/arch/arm/mach-pxa/colibri-pxa270.c @@ -24,7 +24,7 @@ #include <asm/mach/arch.h> #include <asm/mach/flash.h> #include <asm/mach-types.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/audio.h> #include "colibri.h" diff --git a/arch/arm/mach-pxa/colibri-pxa300.c b/arch/arm/mach-pxa/colibri-pxa300.c index 6a5558d95d4e..2f635bdc797f 100644 --- a/arch/arm/mach-pxa/colibri-pxa300.c +++ b/arch/arm/mach-pxa/colibri-pxa300.c @@ -18,7 +18,7 @@ #include <linux/interrupt.h> #include <asm/mach-types.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/irq.h> diff --git a/arch/arm/mach-pxa/colibri-pxa320.c b/arch/arm/mach-pxa/colibri-pxa320.c index 17067a3039a8..ffcefe6dbc82 100644 --- a/arch/arm/mach-pxa/colibri-pxa320.c +++ b/arch/arm/mach-pxa/colibri-pxa320.c @@ -19,7 +19,7 @@ #include <linux/usb/gpio_vbus.h> #include <asm/mach-types.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/irq.h> diff --git a/arch/arm/mach-pxa/colibri-pxa3xx.c b/arch/arm/mach-pxa/colibri-pxa3xx.c index e31a591e949f..0c88e4e417b4 100644 --- a/arch/arm/mach-pxa/colibri-pxa3xx.c +++ b/arch/arm/mach-pxa/colibri-pxa3xx.c @@ -17,7 +17,7 @@ #include <linux/etherdevice.h> #include <asm/mach-types.h> #include <mach/hardware.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/system_info.h> #include <asm/mach/arch.h> #include <asm/mach/irq.h> diff --git a/arch/arm/mach-pxa/gumstix.c b/arch/arm/mach-pxa/gumstix.c index 4764acca5480..eb03283ccdee 100644 --- a/arch/arm/mach-pxa/gumstix.c +++ b/arch/arm/mach-pxa/gumstix.c @@ -33,7 +33,7 @@ #include <asm/mach-types.h> #include <mach/hardware.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-pxa/lpd270.c b/arch/arm/mach-pxa/lpd270.c index e9f401b0a432..5c03c4f7b82e 100644 --- a/arch/arm/mach-pxa/lpd270.c +++ b/arch/arm/mach-pxa/lpd270.c @@ -33,7 +33,7 @@ #include <asm/mach-types.h> #include <mach/hardware.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c index c1bd0d544981..825939877839 100644 --- a/arch/arm/mach-pxa/lubbock.c +++ b/arch/arm/mach-pxa/lubbock.c @@ -39,7 +39,7 @@ #include <asm/mach-types.h> #include <mach/hardware.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index d6e17d407ac0..b3f8592eebe6 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -40,7 +40,7 @@ #include <asm/mach-types.h> #include <mach/hardware.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-pxa/trizeps4.c b/arch/arm/mach-pxa/trizeps4.c index c76f1daecfc9..99a2ee433f1f 100644 --- a/arch/arm/mach-pxa/trizeps4.c +++ b/arch/arm/mach-pxa/trizeps4.c @@ -35,7 +35,7 @@ #include <asm/memory.h> #include <asm/mach-types.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c index ab2f89266bbd..c4c25a2f24f6 100644 --- a/arch/arm/mach-pxa/viper.c +++ b/arch/arm/mach-pxa/viper.c @@ -58,7 +58,7 @@ #include <asm/setup.h> #include <asm/mach-types.h> #include <asm/irq.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/system_info.h> #include <asm/mach/arch.h> diff --git a/arch/arm/mach-s3c24xx/include/mach/hardware.h b/arch/arm/mach-s3c24xx/include/mach/hardware.h index 1b2975708e3f..f28ac6c78d82 100644 --- a/arch/arm/mach-s3c24xx/include/mach/hardware.h +++ b/arch/arm/mach-s3c24xx/include/mach/hardware.h @@ -15,7 +15,7 @@ extern unsigned int s3c2410_modify_misccr(unsigned int clr, unsigned int chg); #endif /* __ASSEMBLY__ */ -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/map.h> #endif /* __ASM_ARCH_HARDWARE_H */ diff --git a/arch/arm/mach-sa1100/include/mach/memory.h b/arch/arm/mach-sa1100/include/mach/memory.h index fa5cf4744992..3b19296f5062 100644 --- a/arch/arm/mach-sa1100/include/mach/memory.h +++ b/arch/arm/mach-sa1100/include/mach/memory.h @@ -8,7 +8,7 @@ #ifndef __ASM_ARCH_MEMORY_H #define __ASM_ARCH_MEMORY_H -#include <asm/sizes.h> +#include <linux/sizes.h> /* * Because of the wide memory address space between physical RAM banks on the diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c index eb60a71cf125..a671e4c994cf 100644 --- a/arch/arm/mach-sa1100/neponset.c +++ b/arch/arm/mach-sa1100/neponset.c @@ -21,7 +21,7 @@ #include <asm/mach-types.h> #include <asm/mach/map.h> #include <asm/hardware/sa1111.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/hardware.h> #include <mach/assabet.h> diff --git a/arch/arm/mach-tegra/iomap.h b/arch/arm/mach-tegra/iomap.h index 9bc291e76887..4af9e92a216f 100644 --- a/arch/arm/mach-tegra/iomap.h +++ b/arch/arm/mach-tegra/iomap.h @@ -20,7 +20,7 @@ #define __MACH_TEGRA_IOMAP_H #include <asm/pgtable.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #define TEGRA_IRAM_BASE 0x40000000 #define TEGRA_IRAM_SIZE SZ_256K diff --git a/arch/arm/mach-tegra/irammap.h b/arch/arm/mach-tegra/irammap.h index e32e1742c9a1..6a7bb887585e 100644 --- a/arch/arm/mach-tegra/irammap.h +++ b/arch/arm/mach-tegra/irammap.h @@ -17,7 +17,7 @@ #ifndef __MACH_TEGRA_IRAMMAP_H #define __MACH_TEGRA_IRAMMAP_H -#include <asm/sizes.h> +#include <linux/sizes.h> /* The first 1K of IRAM is permanently reserved for the CPU reset handler */ #define TEGRA_IRAM_RESET_HANDLER_OFFSET 0 diff --git a/arch/arm/mach-w90x900/include/mach/hardware.h b/arch/arm/mach-w90x900/include/mach/hardware.h index fe3c6265a466..2e6555df538e 100644 --- a/arch/arm/mach-w90x900/include/mach/hardware.h +++ b/arch/arm/mach-w90x900/include/mach/hardware.h @@ -18,7 +18,7 @@ #ifndef __ASM_ARCH_HARDWARE_H #define __ASM_ARCH_HARDWARE_H -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/map.h> #endif /* __ASM_ARCH_HARDWARE_H */ diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index eb0df239a759..9e977dedf193 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += qspinlock.h generic-y += segment.h generic-y += serial.h generic-y += set_memory.h -generic-y += sizes.h generic-y += switch_to.h generic-y += trace_clock.h generic-y += unaligned.h diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h index 355e552a9175..c7f67da13cd9 100644 --- a/arch/arm64/include/asm/boot.h +++ b/arch/arm64/include/asm/boot.h @@ -3,7 +3,7 @@ #ifndef __ASM_BOOT_H #define __ASM_BOOT_H -#include <asm/sizes.h> +#include <linux/sizes.h> /* * arm64 requires the DTB to be 8 byte aligned and diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f210bcf096f7..bc895c869892 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -401,7 +401,7 @@ unsigned long cpu_get_elf_hwcap2(void); #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) /* System capability check for constant caps */ -static inline bool __cpus_have_const_cap(int num) +static __always_inline bool __cpus_have_const_cap(int num) { if (num >= ARM64_NCAPS) return false; @@ -415,7 +415,7 @@ static inline bool cpus_have_cap(unsigned int num) return test_bit(num, cpu_hwcaps); } -static inline bool cpus_have_const_cap(int num) +static __always_inline bool cpus_have_const_cap(int num) { if (static_branch_likely(&arm64_const_caps_ready)) return __cpus_have_const_cap(num); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 2cb8248fa2c8..8ffcf5a512bb 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -26,7 +26,7 @@ #include <linux/types.h> #include <asm/bug.h> #include <asm/page-def.h> -#include <asm/sizes.h> +#include <linux/sizes.h> /* * Size of the PCI I/O space. This must remain a power of two so that diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 007c05a4cce0..d2adffb81b5d 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -48,7 +48,7 @@ #include <asm/numa.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/tlb.h> #include <asm/alternative.h> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ef32d4839c3f..a170c6369a68 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -40,7 +40,7 @@ #include <asm/kernel-pgtable.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #include <asm/ptdump.h> diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 123d8f54be4a..f2e22058e488 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -42,7 +42,6 @@ generic-y += scatterlist.h generic-y += sections.h generic-y += serial.h generic-y += shmparam.h -generic-y += sizes.h generic-y += spinlock.h generic-y += timex.h generic-y += tlbflush.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 6234a303d2a3..4a3d72f76ea2 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -32,7 +32,6 @@ generic-y += sections.h generic-y += segment.h generic-y += serial.h generic-y += shmparam.h -generic-y += sizes.h generic-y += topology.h generic-y += trace_clock.h generic-y += unaligned.h diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index 830c93a010c3..9a466dde9b96 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -482,7 +482,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long * * Return the bit position (0..63) of the most significant 1 bit in a word * Returns -1 if no 1 bit exists */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { int num; @@ -548,7 +548,7 @@ static inline unsigned long __fls(unsigned long word) * Returns 0..SZLONG-1 * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { return __fls(word & -word); } diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c index bada74af7641..c04b97aace4a 100644 --- a/arch/mips/kernel/cpu-bugs64.c +++ b/arch/mips/kernel/cpu-bugs64.c @@ -42,8 +42,8 @@ static inline void align_mod(const int align, const int mod) : "n"(align), "n"(mod)); } -static inline void mult_sh_align_mod(long *v1, long *v2, long *w, - const int align, const int mod) +static __always_inline void mult_sh_align_mod(long *v1, long *v2, long *w, + const int align, const int mod) { unsigned long flags; int m1, m2; diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index 688b6ed26227..f67a327777b5 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -39,7 +39,6 @@ generic-y += preempt.h generic-y += sections.h generic-y += segment.h generic-y += serial.h -generic-y += sizes.h generic-y += switch_to.h generic-y += timex.h generic-y += topology.h diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index 9f52db930c00..ee59c1f9e4fc 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h @@ -6,7 +6,7 @@ #define __PAGETABLE_PMD_FOLDED 1 #include <asm-generic/4level-fixup.h> -#include <asm-generic/sizes.h> +#include <linux/sizes.h> #include <asm/memory.h> #include <asm/nds32.h> diff --git a/arch/nds32/kernel/head.S b/arch/nds32/kernel/head.S index db64b78b1232..fcefb62606ca 100644 --- a/arch/nds32/kernel/head.S +++ b/arch/nds32/kernel/head.S @@ -7,7 +7,7 @@ #include <asm/asm-offsets.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/thread_info.h> #ifdef CONFIG_CPU_BIG_ENDIAN diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 523bb99d7676..00682b8df330 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -628,14 +628,14 @@ static int __init prom_next_node(phandle *nodep) } } -static inline int prom_getprop(phandle node, const char *pname, - void *value, size_t valuelen) +static inline int __init prom_getprop(phandle node, const char *pname, + void *value, size_t valuelen) { return call_prom("getprop", 4, 1, node, ADDR(pname), (u32)(unsigned long) value, (u32) valuelen); } -static inline int prom_getproplen(phandle node, const char *pname) +static inline int __init prom_getproplen(phandle node, const char *pname) { return call_prom("getproplen", 2, 1, node, ADDR(pname)); } diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e8e93c2c7d03..7a1708875d27 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -610,7 +610,7 @@ SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2); SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3); SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4); SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5); -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC SYSFS_SPRSETUP(hid0, SPRN_HID0); SYSFS_SPRSETUP(hid1, SPRN_HID1); SYSFS_SPRSETUP(hid4, SPRN_HID4); @@ -639,7 +639,7 @@ SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0); SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1); SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2); SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3); -#endif /* CONFIG_DEBUG_KERNEL */ +#endif /* CONFIG_DEBUG_MISC */ #endif /* HAS_PPC_PMC_PA6T */ #ifdef HAS_PPC_PMC_IBM @@ -680,7 +680,7 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3), __ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4), __ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5), -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC __ATTR(hid0, 0600, show_hid0, store_hid0), __ATTR(hid1, 0600, show_hid1, store_hid1), __ATTR(hid4, 0600, show_hid4, store_hid4), @@ -709,7 +709,7 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(tsr1, 0600, show_tsr1, store_tsr1), __ATTR(tsr2, 0600, show_tsr2, store_tsr2), __ATTR(tsr3, 0600, show_tsr3, store_tsr3), -#endif /* CONFIG_DEBUG_KERNEL */ +#endif /* CONFIG_DEBUG_MISC */ }; #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 6a23b9ebd2a1..4d841369399f 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -90,7 +90,7 @@ void radix__tlbiel_all(unsigned int action) asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } -static inline void __tlbiel_pid(unsigned long pid, int set, +static __always_inline void __tlbiel_pid(unsigned long pid, int set, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -106,7 +106,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -120,7 +120,7 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void __tlbiel_lpid(unsigned long lpid, int set, +static __always_inline void __tlbiel_lpid(unsigned long lpid, int set, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -136,7 +136,7 @@ static inline void __tlbiel_lpid(unsigned long lpid, int set, trace_tlbie(lpid, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -928,7 +928,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) tlb->need_flush_all = 0; } -static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, +static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize, bool also_pwc) { diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index 3cc52e37b4b2..f316de40e51b 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -202,7 +202,7 @@ static inline int __cpacf_check_opcode(unsigned int opcode) } } -static inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +static __always_inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) { if (__cpacf_check_opcode(opcode)) { __cpacf_query(opcode, mask); diff --git a/arch/sh/boards/board-apsh4a3a.c b/arch/sh/boards/board-apsh4a3a.c index 346eda7a2ef6..abf19a947df3 100644 --- a/arch/sh/boards/board-apsh4a3a.c +++ b/arch/sh/boards/board-apsh4a3a.c @@ -16,7 +16,7 @@ #include <linux/irq.h> #include <linux/clk.h> #include <asm/machvec.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/clock.h> static struct mtd_partition nor_flash_partitions[] = { diff --git a/arch/sh/boards/board-apsh4ad0a.c b/arch/sh/boards/board-apsh4ad0a.c index 4efa9c571f64..fa031a16c9b5 100644 --- a/arch/sh/boards/board-apsh4ad0a.c +++ b/arch/sh/boards/board-apsh4ad0a.c @@ -15,7 +15,7 @@ #include <linux/irq.h> #include <linux/clk.h> #include <asm/machvec.h> -#include <asm/sizes.h> +#include <linux/sizes.h> /* Dummy supplies, where voltage doesn't matter */ static struct regulator_consumer_supply dummy_supplies[] = { diff --git a/arch/sh/boards/board-edosk7705.c b/arch/sh/boards/board-edosk7705.c index 67a8803eb3f9..0de7d603da2d 100644 --- a/arch/sh/boards/board-edosk7705.c +++ b/arch/sh/boards/board-edosk7705.c @@ -16,7 +16,7 @@ #include <linux/smc91x.h> #include <linux/sh_intc.h> #include <asm/machvec.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #define SMC_IOBASE 0xA2000000 #define SMC_IO_OFFSET 0x300 diff --git a/arch/sh/boards/board-edosk7760.c b/arch/sh/boards/board-edosk7760.c index 0fbe91cba67a..7569d85c5ff5 100644 --- a/arch/sh/boards/board-edosk7760.c +++ b/arch/sh/boards/board-edosk7760.c @@ -18,7 +18,7 @@ #include <asm/addrspace.h> #include <asm/delay.h> #include <asm/i2c-sh7760.h> -#include <asm/sizes.h> +#include <linux/sizes.h> /* Bus state controller registers for CS4 area */ #define BSC_CS4BCR 0xA4FD0010 diff --git a/arch/sh/boards/board-espt.c b/arch/sh/boards/board-espt.c index f478fee3b48a..6e784b5cf5a0 100644 --- a/arch/sh/boards/board-espt.c +++ b/arch/sh/boards/board-espt.c @@ -13,7 +13,7 @@ #include <linux/sh_eth.h> #include <linux/sh_intc.h> #include <asm/machvec.h> -#include <asm/sizes.h> +#include <linux/sizes.h> /* NOR Flash */ static struct mtd_partition espt_nor_flash_partitions[] = { diff --git a/arch/sh/boards/board-urquell.c b/arch/sh/boards/board-urquell.c index 799af57c0b81..dad2b3b40735 100644 --- a/arch/sh/boards/board-urquell.c +++ b/arch/sh/boards/board-urquell.c @@ -21,7 +21,7 @@ #include <mach/urquell.h> #include <cpu/sh7786.h> #include <asm/heartbeat.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/smp-ops.h> /* diff --git a/arch/sh/boards/mach-microdev/setup.c b/arch/sh/boards/mach-microdev/setup.c index 706b48f797be..f4a777fe2d01 100644 --- a/arch/sh/boards/mach-microdev/setup.c +++ b/arch/sh/boards/mach-microdev/setup.c @@ -15,7 +15,7 @@ #include <mach/microdev.h> #include <asm/io.h> #include <asm/machvec.h> -#include <asm/sizes.h> +#include <linux/sizes.h> static struct resource smc91x_resources[] = { [0] = { diff --git a/arch/sh/boards/mach-sdk7786/fpga.c b/arch/sh/boards/mach-sdk7786/fpga.c index 6d2a3d381c2a..895576ff8376 100644 --- a/arch/sh/boards/mach-sdk7786/fpga.c +++ b/arch/sh/boards/mach-sdk7786/fpga.c @@ -8,7 +8,7 @@ #include <linux/io.h> #include <linux/bcd.h> #include <mach/fpga.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #define FPGA_REGS_OFFSET 0x03fff800 #define FPGA_REGS_SIZE 0x490 diff --git a/arch/sh/boards/mach-sdk7786/setup.c b/arch/sh/boards/mach-sdk7786/setup.c index 65721c3a482c..d183026dbeb1 100644 --- a/arch/sh/boards/mach-sdk7786/setup.c +++ b/arch/sh/boards/mach-sdk7786/setup.c @@ -19,7 +19,7 @@ #include <mach/irq.h> #include <asm/machvec.h> #include <asm/heartbeat.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/clock.h> #include <asm/reboot.h> #include <asm/smp-ops.h> diff --git a/arch/sh/boards/mach-sdk7786/sram.c b/arch/sh/boards/mach-sdk7786/sram.c index d76cdb7ede39..7c6ca976f332 100644 --- a/arch/sh/boards/mach-sdk7786/sram.c +++ b/arch/sh/boards/mach-sdk7786/sram.c @@ -13,7 +13,7 @@ #include <linux/string.h> #include <mach/fpga.h> #include <asm/sram.h> -#include <asm/sizes.h> +#include <linux/sizes.h> static int __init fpga_sram_init(void) { diff --git a/arch/sh/boards/mach-se/7343/irq.c b/arch/sh/boards/mach-se/7343/irq.c index 39a3175e72b2..1aedbfe32654 100644 --- a/arch/sh/boards/mach-se/7343/irq.c +++ b/arch/sh/boards/mach-se/7343/irq.c @@ -16,7 +16,7 @@ #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/io.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach-se/mach/se7343.h> #define PA_CPLD_BASE_ADDR 0x11400000 diff --git a/arch/sh/boards/mach-se/7722/irq.c b/arch/sh/boards/mach-se/7722/irq.c index f6e3009edd4e..6d34592767f8 100644 --- a/arch/sh/boards/mach-se/7722/irq.c +++ b/arch/sh/boards/mach-se/7722/irq.c @@ -14,7 +14,7 @@ #include <linux/irqdomain.h> #include <linux/io.h> #include <linux/err.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach-se/mach/se7722.h> #define IRQ01_BASE_ADDR 0x11800000 diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 1b9e5caac389..11ed21c2e9bb 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -14,7 +14,7 @@ #include <linux/io.h> #include "pci-sh4.h" #include <asm/addrspace.h> -#include <asm/sizes.h> +#include <linux/sizes.h> static int __init __area_sdram_check(struct pci_channel *chan, unsigned int area) diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 3fd0f392a0ee..287b3a68570c 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -16,7 +16,7 @@ #include <linux/log2.h> #include "pci-sh4.h" #include <asm/mmu.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #if defined(CONFIG_CPU_BIG_ENDIAN) # define PCICR_ENDIANNESS SH4_PCICR_BSWP diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index a58b77cea295..e0b568aaa701 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -18,7 +18,7 @@ #include <linux/sh_intc.h> #include <cpu/sh7786.h> #include "pcie-sh7786.h" -#include <asm/sizes.h> +#include <linux/sizes.h> struct sh7786_pcie_port { struct pci_channel *hose; diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 73fff39a0122..51a54df22c11 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -18,6 +18,5 @@ generic-y += parport.h generic-y += percpu.h generic-y += preempt.h generic-y += serial.h -generic-y += sizes.h generic-y += trace_clock.h generic-y += xor.h diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index b95e343e3c9d..5aeb4d7099a1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -26,7 +26,7 @@ #include <asm/sections.h> #include <asm/setup.h> #include <asm/cache.h> -#include <asm/sizes.h> +#include <linux/sizes.h> pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 7b2cc490ebb7..a53a040d0054 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -24,7 +24,7 @@ #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <asm/cacheflush.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/page.h> diff --git a/arch/sh/mm/uncached.c b/arch/sh/mm/uncached.c index 010010bf205a..bd1585e8efed 100644 --- a/arch/sh/mm/uncached.c +++ b/arch/sh/mm/uncached.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/module.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/page.h> #include <asm/addrspace.h> diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index b301a0b3c0b2..c93dc6478cb2 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -31,7 +31,6 @@ generic-y += sections.h generic-y += segment.h generic-y += serial.h generic-y += shmparam.h -generic-y += sizes.h generic-y += syscalls.h generic-y += topology.h generic-y += trace_clock.h diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h index 66bb9f6525c0..46cf27efbb7e 100644 --- a/arch/unicore32/include/asm/memory.h +++ b/arch/unicore32/include/asm/memory.h @@ -16,7 +16,7 @@ #include <linux/compiler.h> #include <linux/const.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/memory.h> /* diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index b4442f3060ce..c994cdf14119 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -23,7 +23,7 @@ #include <asm/sections.h> #include <asm/setup.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/tlb.h> #include <asm/memblock.h> #include <mach/map.h> diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c index bf012b2b71a9..b69cb18ce8b1 100644 --- a/arch/unicore32/mm/ioremap.c +++ b/arch/unicore32/mm/ioremap.c @@ -34,7 +34,7 @@ #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/map.h> #include "mm.h" diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c index aa2060beb408..f0ae623b305f 100644 --- a/arch/unicore32/mm/mmu.c +++ b/arch/unicore32/mm/mmu.c @@ -22,7 +22,7 @@ #include <asm/cputype.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/tlb.h> #include <asm/memblock.h> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 818b361094ed..326b2d5bab9d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -305,9 +305,6 @@ config ZONE_DMA32 config AUDIT_ARCH def_bool y if X86_64 -config ARCH_SUPPORTS_OPTIMIZED_INLINING - def_bool y - config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 15d0fbe27872..f730680dc818 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -266,20 +266,6 @@ config CPA_DEBUG ---help--- Do change_page_attr() self-tests every 30 seconds. -config OPTIMIZE_INLINING - bool "Allow gcc to uninline functions marked 'inline'" - ---help--- - This option determines if the kernel forces gcc to inline the functions - developers have marked 'inline'. Doing so takes away freedom from gcc to - do what it thinks is best, which is desirable for the gcc 3.x series of - compilers. The gcc 4.x series have a rewritten inlining algorithm and - enabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc 4.x and above to make the - decision will become the default in the future. Until then this option - is there to test gcc for this. - - If unsure, say N. - config DEBUG_ENTRY bool "Debug low-level entry code" depends on DEBUG_KERNEL diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 7cdd7b13bbda..890a3fb5706f 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -23,7 +23,7 @@ #include <linux/device.h> #include <linux/coredump.h> -#include <asm-generic/sizes.h> +#include <linux/sizes.h> #include <asm/perf_event.h> #include "../perf_event.h" diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2bdbbbcfa393..cdf44aa9a501 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file contains definitions from Hyper-V Hypervisor Top-Level Functional diff --git a/arch/xtensa/include/asm/irqflags.h b/arch/xtensa/include/asm/irqflags.h index 9b5e8526afe5..12890681587b 100644 --- a/arch/xtensa/include/asm/irqflags.h +++ b/arch/xtensa/include/asm/irqflags.h @@ -27,7 +27,7 @@ static inline unsigned long arch_local_irq_save(void) { unsigned long flags; #if XTENSA_FAKE_NMI -#if defined(CONFIG_DEBUG_KERNEL) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL +#if defined(CONFIG_DEBUG_MISC) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL unsigned long tmp; asm volatile("rsr %0, ps\t\n" diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 3699d6d3e479..83b244ce61ee 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -126,7 +126,7 @@ void secondary_start_kernel(void) init_mmu(); -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC if (boot_secondary_processors == 0) { pr_debug("%s: boot_secondary_processors:%d; Hanging cpu:%d\n", __func__, boot_secondary_processors, cpu); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index eb33d2d00d77..e20e6b429804 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -33,7 +33,7 @@ #include <linux/types.h> #include <linux/of_graph.h> #include <linux/of_device.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <linux/kthread.h> #include <drm/drmP.h> diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 9fb0eb7a4d02..b4b87d6ae67f 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -34,7 +34,7 @@ #include <linux/of_iommu.h> #include <asm/cacheflush.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include "msm_iommu_hw-8xxx.h" #include "msm_iommu.h" diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index e22bbff89c8d..9cb93e15b197 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -24,7 +24,7 @@ #include <linux/mmc/host.h> #include <linux/mmc/slot-gpio.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/unaligned.h> #include "mvsdio.h" diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index c1d3f0e38921..e7d80c83da2c 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -35,7 +35,7 @@ #include <linux/of.h> #include <linux/of_device.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/hardware.h> #include <linux/platform_data/mmc-pxamci.h> diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c index fd5fe12d7461..893239629d6b 100644 --- a/drivers/mtd/maps/sa1100-flash.c +++ b/drivers/mtd/maps/sa1100-flash.c @@ -20,7 +20,7 @@ #include <linux/mtd/concat.h> #include <mach/hardware.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/mach/flash.h> struct sa_subdev_info { diff --git a/drivers/mtd/nand/raw/vf610_nfc.c b/drivers/mtd/nand/raw/vf610_nfc.c index 6d43ddb3332f..e4fe8c4bc711 100644 --- a/drivers/mtd/nand/raw/vf610_nfc.c +++ b/drivers/mtd/nand/raw/vf610_nfc.c @@ -364,7 +364,7 @@ static int vf610_nfc_cmd(struct nand_chip *chip, { const struct nand_op_instr *instr; struct vf610_nfc *nfc = chip_to_nfc(chip); - int op_id = -1, trfr_sz = 0, offset; + int op_id = -1, trfr_sz = 0, offset = 0; u32 col = 0, row = 0, cmd1 = 0, cmd2 = 0, code = 0; bool force8bit = false; diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c index c2a17a79f0b2..267fb875e40f 100644 --- a/drivers/pcmcia/omap_cf.c +++ b/drivers/pcmcia/omap_cf.c @@ -22,7 +22,7 @@ #include <mach/hardware.h> #include <asm/io.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <mach/mux.h> #include <mach/tc.h> diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c index dd5d1103e02b..4b6418039387 100644 --- a/drivers/pps/clients/pps-gpio.c +++ b/drivers/pps/clients/pps-gpio.c @@ -31,19 +31,25 @@ #include <linux/slab.h> #include <linux/pps_kernel.h> #include <linux/pps-gpio.h> -#include <linux/gpio.h> +#include <linux/gpio/consumer.h> #include <linux/list.h> #include <linux/of_device.h> #include <linux/of_gpio.h> +#include <linux/timer.h> +#include <linux/jiffies.h> /* Info for each registered platform device */ struct pps_gpio_device_data { int irq; /* IRQ used as PPS source */ struct pps_device *pps; /* PPS source device */ struct pps_source_info info; /* PPS source information */ + struct gpio_desc *gpio_pin; /* GPIO port descriptors */ + struct gpio_desc *echo_pin; + struct timer_list echo_timer; /* timer to reset echo active state */ bool assert_falling_edge; bool capture_clear; - unsigned int gpio_pin; + unsigned int echo_active_ms; /* PPS echo active duration */ + unsigned long echo_timeout; /* timer timeout value in jiffies */ }; /* @@ -61,18 +67,101 @@ static irqreturn_t pps_gpio_irq_handler(int irq, void *data) info = data; - rising_edge = gpio_get_value(info->gpio_pin); + rising_edge = gpiod_get_value(info->gpio_pin); if ((rising_edge && !info->assert_falling_edge) || (!rising_edge && info->assert_falling_edge)) - pps_event(info->pps, &ts, PPS_CAPTUREASSERT, NULL); + pps_event(info->pps, &ts, PPS_CAPTUREASSERT, data); else if (info->capture_clear && ((rising_edge && info->assert_falling_edge) || - (!rising_edge && !info->assert_falling_edge))) - pps_event(info->pps, &ts, PPS_CAPTURECLEAR, NULL); + (!rising_edge && !info->assert_falling_edge))) + pps_event(info->pps, &ts, PPS_CAPTURECLEAR, data); return IRQ_HANDLED; } +/* This function will only be called when an ECHO GPIO is defined */ +static void pps_gpio_echo(struct pps_device *pps, int event, void *data) +{ + /* add_timer() needs to write into info->echo_timer */ + struct pps_gpio_device_data *info = data; + + switch (event) { + case PPS_CAPTUREASSERT: + if (pps->params.mode & PPS_ECHOASSERT) + gpiod_set_value(info->echo_pin, 1); + break; + + case PPS_CAPTURECLEAR: + if (pps->params.mode & PPS_ECHOCLEAR) + gpiod_set_value(info->echo_pin, 1); + break; + } + + /* fire the timer */ + if (info->pps->params.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) { + info->echo_timer.expires = jiffies + info->echo_timeout; + add_timer(&info->echo_timer); + } +} + +/* Timer callback to reset the echo pin to the inactive state */ +static void pps_gpio_echo_timer_callback(struct timer_list *t) +{ + const struct pps_gpio_device_data *info; + + info = from_timer(info, t, echo_timer); + + gpiod_set_value(info->echo_pin, 0); +} + +static int pps_gpio_setup(struct platform_device *pdev) +{ + struct pps_gpio_device_data *data = platform_get_drvdata(pdev); + struct device_node *np = pdev->dev.of_node; + int ret; + u32 value; + + data->gpio_pin = devm_gpiod_get(&pdev->dev, + NULL, /* request "gpios" */ + GPIOD_IN); + if (IS_ERR(data->gpio_pin)) { + dev_err(&pdev->dev, + "failed to request PPS GPIO\n"); + return PTR_ERR(data->gpio_pin); + } + + data->echo_pin = devm_gpiod_get_optional(&pdev->dev, + "echo", + GPIOD_OUT_LOW); + if (data->echo_pin) { + if (IS_ERR(data->echo_pin)) { + dev_err(&pdev->dev, "failed to request ECHO GPIO\n"); + return PTR_ERR(data->echo_pin); + } + + ret = of_property_read_u32(np, + "echo-active-ms", + &value); + if (ret) { + dev_err(&pdev->dev, + "failed to get echo-active-ms from OF\n"); + return ret; + } + data->echo_active_ms = value; + /* sanity check on echo_active_ms */ + if (!data->echo_active_ms || data->echo_active_ms > 999) { + dev_err(&pdev->dev, + "echo-active-ms: %u - bad value from OF\n", + data->echo_active_ms); + return -EINVAL; + } + } + + if (of_property_read_bool(np, "assert-falling-edge")) + data->assert_falling_edge = true; + return 0; +} + static unsigned long get_irqf_trigger_flags(const struct pps_gpio_device_data *data) { @@ -90,53 +179,32 @@ get_irqf_trigger_flags(const struct pps_gpio_device_data *data) static int pps_gpio_probe(struct platform_device *pdev) { struct pps_gpio_device_data *data; - const char *gpio_label; int ret; int pps_default_params; const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; - struct device_node *np = pdev->dev.of_node; /* allocate space for device info */ - data = devm_kzalloc(&pdev->dev, sizeof(struct pps_gpio_device_data), - GFP_KERNEL); + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; + platform_set_drvdata(pdev, data); + /* GPIO setup */ if (pdata) { data->gpio_pin = pdata->gpio_pin; - gpio_label = pdata->gpio_label; + data->echo_pin = pdata->echo_pin; data->assert_falling_edge = pdata->assert_falling_edge; data->capture_clear = pdata->capture_clear; + data->echo_active_ms = pdata->echo_active_ms; } else { - ret = of_get_gpio(np, 0); - if (ret < 0) { - dev_err(&pdev->dev, "failed to get GPIO from device tree\n"); - return ret; - } - data->gpio_pin = ret; - gpio_label = PPS_GPIO_NAME; - - if (of_get_property(np, "assert-falling-edge", NULL)) - data->assert_falling_edge = true; - } - - /* GPIO setup */ - ret = devm_gpio_request(&pdev->dev, data->gpio_pin, gpio_label); - if (ret) { - dev_err(&pdev->dev, "failed to request GPIO %u\n", - data->gpio_pin); - return ret; - } - - ret = gpio_direction_input(data->gpio_pin); - if (ret) { - dev_err(&pdev->dev, "failed to set pin direction\n"); - return -EINVAL; + ret = pps_gpio_setup(pdev); + if (ret) + return -EINVAL; } /* IRQ setup */ - ret = gpio_to_irq(data->gpio_pin); + ret = gpiod_to_irq(data->gpio_pin); if (ret < 0) { dev_err(&pdev->dev, "failed to map GPIO to IRQ: %d\n", ret); return -EINVAL; @@ -152,6 +220,11 @@ static int pps_gpio_probe(struct platform_device *pdev) data->info.owner = THIS_MODULE; snprintf(data->info.name, PPS_MAX_NAME_LEN - 1, "%s.%d", pdev->name, pdev->id); + if (data->echo_pin) { + data->info.echo = pps_gpio_echo; + data->echo_timeout = msecs_to_jiffies(data->echo_active_ms); + timer_setup(&data->echo_timer, pps_gpio_echo_timer_callback, 0); + } /* register PPS source */ pps_default_params = PPS_CAPTUREASSERT | PPS_OFFSETASSERT; @@ -173,7 +246,6 @@ static int pps_gpio_probe(struct platform_device *pdev) return -EINVAL; } - platform_set_drvdata(pdev, data); dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n", data->irq); @@ -185,6 +257,11 @@ static int pps_gpio_remove(struct platform_device *pdev) struct pps_gpio_device_data *data = platform_get_drvdata(pdev); pps_unregister_source(data->pps); + if (data->echo_pin) { + del_timer_sync(&data->echo_timer); + /* reset echo pin in any case */ + gpiod_set_value(data->echo_pin, 0); + } dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq); return 0; } @@ -209,4 +286,4 @@ MODULE_AUTHOR("Ricardo Martins <rasm@fe.up.pt>"); MODULE_AUTHOR("James Nuss <jamesnuss@nanometrics.ca>"); MODULE_DESCRIPTION("Use GPIO pin as PPS source"); MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0.0"); +MODULE_VERSION("1.2.0"); diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index cf45829585cb..b29fc258eeba 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -2147,6 +2147,14 @@ static int riocm_add_mport(struct device *dev, mutex_init(&cm->rx_lock); riocm_rx_fill(cm, RIOCM_RX_RING_SIZE); cm->rx_wq = create_workqueue(DRV_NAME "/rxq"); + if (!cm->rx_wq) { + riocm_error("failed to allocate IBMBOX_%d on %s", + cmbox, mport->name); + rio_release_outb_mbox(mport, cmbox); + kfree(cm); + return -ENOMEM; + } + INIT_WORK(&cm->rx_work, rio_ibmsg_handler); cm->tx_slot = 0; diff --git a/drivers/sh/intc/userimask.c b/drivers/sh/intc/userimask.c index e649ceaaa410..87d69e7471f9 100644 --- a/drivers/sh/intc/userimask.c +++ b/drivers/sh/intc/userimask.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/stat.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include "internals.h" static void __iomem *uimask; diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 3912526ead66..cdb613d38062 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -15,6 +15,7 @@ #include <linux/clk.h> #include <linux/dmaengine.h> +#include <linux/interrupt.h> #include <linux/module.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 59e82e6d776d..573b2055173c 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -527,8 +527,12 @@ void __handle_sysrq(int key, bool check_mask) { struct sysrq_key_op *op_p; int orig_log_level; + int orig_suppress_printk; int i; + orig_suppress_printk = suppress_printk; + suppress_printk = 0; + rcu_sysrq_start(); rcu_read_lock(); /* @@ -574,6 +578,8 @@ void __handle_sysrq(int key, bool check_mask) } rcu_read_unlock(); rcu_sysrq_end(); + + suppress_printk = orig_suppress_printk; } void handle_sysrq(int key) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 53b8ceea9bde..fb45f866b923 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -155,21 +155,6 @@ static const struct backlight_ops pwm_backlight_ops = { #ifdef CONFIG_OF #define PWM_LUMINANCE_SCALE 10000 /* luminance scale */ -/* An integer based power function */ -static u64 int_pow(u64 base, int exp) -{ - u64 result = 1; - - while (exp) { - if (exp & 1) - result *= base; - exp >>= 1; - base *= base; - } - - return result; -} - /* * CIE lightness to PWM conversion. * diff --git a/drivers/video/fbdev/fb-puv3.c b/drivers/video/fbdev/fb-puv3.c index d9e816d53531..1bddcc20b2c0 100644 --- a/drivers/video/fbdev/fb-puv3.c +++ b/drivers/video/fbdev/fb-puv3.c @@ -20,7 +20,7 @@ #include <linux/console.h> #include <linux/mm.h> -#include <asm/sizes.h> +#include <linux/sizes.h> #include <asm/pgtable.h> #include <mach/hardware.h> diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 6446bcab4185..93d5bebf9572 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -215,6 +215,9 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) * hypervisor. */ lb_offset = param.local_vaddr & (PAGE_SIZE - 1); + if (param.count == 0 || + param.count > U64_MAX - lb_offset - PAGE_SIZE + 1) + return -EINVAL; num_pages = (param.count + lb_offset + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Allocate the buffers we need */ @@ -331,8 +334,8 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set) struct fsl_hv_ioctl_prop param; char __user *upath, *upropname; void __user *upropval; - char *path = NULL, *propname = NULL; - void *propval = NULL; + char *path, *propname; + void *propval; int ret = 0; /* Get the parameters from the user. */ @@ -344,32 +347,30 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set) upropval = (void __user *)(uintptr_t)param.propval; path = strndup_user(upath, FH_DTPROP_MAX_PATHLEN); - if (IS_ERR(path)) { - ret = PTR_ERR(path); - goto out; - } + if (IS_ERR(path)) + return PTR_ERR(path); propname = strndup_user(upropname, FH_DTPROP_MAX_PATHLEN); if (IS_ERR(propname)) { ret = PTR_ERR(propname); - goto out; + goto err_free_path; } if (param.proplen > FH_DTPROP_MAX_PROPLEN) { ret = -EINVAL; - goto out; + goto err_free_propname; } propval = kmalloc(param.proplen, GFP_KERNEL); if (!propval) { ret = -ENOMEM; - goto out; + goto err_free_propname; } if (set) { if (copy_from_user(propval, upropval, param.proplen)) { ret = -EFAULT; - goto out; + goto err_free_propval; } param.ret = fh_partition_set_dtprop(param.handle, @@ -388,7 +389,7 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set) if (copy_to_user(upropval, propval, param.proplen) || put_user(param.proplen, &p->proplen)) { ret = -EFAULT; - goto out; + goto err_free_propval; } } } @@ -396,10 +397,12 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set) if (put_user(param.ret, &p->ret)) ret = -EFAULT; -out: - kfree(path); +err_free_propval: kfree(propval); +err_free_propname: kfree(propname); +err_free_path: + kfree(path); return ret; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7d09d125f148..fa9e99a962e0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -524,6 +524,19 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ +static inline int make_prot(u32 p_flags) +{ + int prot = 0; + + if (p_flags & PF_R) + prot |= PROT_READ; + if (p_flags & PF_W) + prot |= PROT_WRITE; + if (p_flags & PF_X) + prot |= PROT_EXEC; + return prot; +} + /* This is much more generalized than the library routine read function, so we keep this separate. Technically the library read function is only provided so that we can read a.out libraries that have @@ -563,16 +576,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; - int elf_prot = 0; + int elf_prot = make_prot(eppnt->p_flags); unsigned long vaddr = 0; unsigned long k, map_addr; - if (eppnt->p_flags & PF_R) - elf_prot = PROT_READ; - if (eppnt->p_flags & PF_W) - elf_prot |= PROT_WRITE; - if (eppnt->p_flags & PF_X) - elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED_NOREPLACE; @@ -687,7 +694,6 @@ static int load_elf_binary(struct linux_binprm *bprm) struct file *interpreter = NULL; /* to shut gcc up */ unsigned long load_addr = 0, load_bias = 0; int load_addr_set = 0; - char * elf_interpreter = NULL; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; @@ -698,13 +704,12 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; - struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; - loff_t pos; + struct pt_regs *regs; loc = kmalloc(sizeof(*loc), GFP_KERNEL); if (!loc) { @@ -734,69 +739,66 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; elf_ppnt = elf_phdata; - elf_bss = 0; - elf_brk = 0; + for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + char *elf_interpreter; + loff_t pos; - start_code = ~0UL; - end_code = 0; - start_data = 0; - end_data = 0; + if (elf_ppnt->p_type != PT_INTERP) + continue; - for (i = 0; i < loc->elf_ex.e_phnum; i++) { - if (elf_ppnt->p_type == PT_INTERP) { - /* This is the program interpreter used for - * shared libraries - for now assume that this - * is an a.out format binary - */ - retval = -ENOEXEC; - if (elf_ppnt->p_filesz > PATH_MAX || - elf_ppnt->p_filesz < 2) - goto out_free_ph; - - retval = -ENOMEM; - elf_interpreter = kmalloc(elf_ppnt->p_filesz, - GFP_KERNEL); - if (!elf_interpreter) - goto out_free_ph; - - pos = elf_ppnt->p_offset; - retval = kernel_read(bprm->file, elf_interpreter, - elf_ppnt->p_filesz, &pos); - if (retval != elf_ppnt->p_filesz) { - if (retval >= 0) - retval = -EIO; - goto out_free_interp; - } - /* make sure path is NULL terminated */ - retval = -ENOEXEC; - if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') - goto out_free_interp; + /* + * This is the program interpreter used for shared libraries - + * for now assume that this is an a.out format binary. + */ + retval = -ENOEXEC; + if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2) + goto out_free_ph; - interpreter = open_exec(elf_interpreter); - retval = PTR_ERR(interpreter); - if (IS_ERR(interpreter)) - goto out_free_interp; + retval = -ENOMEM; + elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); + if (!elf_interpreter) + goto out_free_ph; - /* - * If the binary is not readable then enforce - * mm->dumpable = 0 regardless of the interpreter's - * permissions. - */ - would_dump(bprm, interpreter); - - /* Get the exec headers */ - pos = 0; - retval = kernel_read(interpreter, &loc->interp_elf_ex, - sizeof(loc->interp_elf_ex), &pos); - if (retval != sizeof(loc->interp_elf_ex)) { - if (retval >= 0) - retval = -EIO; - goto out_free_dentry; - } + pos = elf_ppnt->p_offset; + retval = kernel_read(bprm->file, elf_interpreter, + elf_ppnt->p_filesz, &pos); + if (retval != elf_ppnt->p_filesz) { + if (retval >= 0) + retval = -EIO; + goto out_free_interp; + } + /* make sure path is NULL terminated */ + retval = -ENOEXEC; + if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') + goto out_free_interp; - break; + interpreter = open_exec(elf_interpreter); + kfree(elf_interpreter); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) + goto out_free_ph; + + /* + * If the binary is not readable then enforce mm->dumpable = 0 + * regardless of the interpreter's permissions. + */ + would_dump(bprm, interpreter); + + /* Get the exec headers */ + pos = 0; + retval = kernel_read(interpreter, &loc->interp_elf_ex, + sizeof(loc->interp_elf_ex), &pos); + if (retval != sizeof(loc->interp_elf_ex)) { + if (retval >= 0) + retval = -EIO; + goto out_free_dentry; } - elf_ppnt++; + + break; + +out_free_interp: + kfree(elf_interpreter); + goto out_free_ph; } elf_ppnt = elf_phdata; @@ -819,7 +821,7 @@ static int load_elf_binary(struct linux_binprm *bprm) } /* Some simple consistency checks for the interpreter */ - if (elf_interpreter) { + if (interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) @@ -884,13 +886,19 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval < 0) goto out_free_dentry; - current->mm->start_stack = bprm->p; + elf_bss = 0; + elf_brk = 0; + + start_code = ~0UL; + end_code = 0; + start_data = 0; + end_data = 0; /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; + int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; unsigned long k, vaddr; unsigned long total_size = 0; @@ -931,12 +939,7 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_fixed = MAP_FIXED; } - if (elf_ppnt->p_flags & PF_R) - elf_prot |= PROT_READ; - if (elf_ppnt->p_flags & PF_W) - elf_prot |= PROT_WRITE; - if (elf_ppnt->p_flags & PF_X) - elf_prot |= PROT_EXEC; + elf_prot = make_prot(elf_ppnt->p_flags); elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; @@ -978,7 +981,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * independently randomized mmap region (0 load_bias * without MAP_FIXED). */ - if (elf_interpreter) { + if (interpreter) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); @@ -1076,7 +1079,7 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out_free_dentry; } - if (elf_interpreter) { + if (interpreter) { unsigned long interp_map_addr = 0; elf_entry = load_elf_interp(&loc->interp_elf_ex, @@ -1100,7 +1103,6 @@ static int load_elf_binary(struct linux_binprm *bprm) allow_write_access(interpreter); fput(interpreter); - kfree(elf_interpreter); } else { elf_entry = loc->elf_ex.e_entry; if (BAD_ADDR(elf_entry)) { @@ -1115,7 +1117,7 @@ static int load_elf_binary(struct linux_binprm *bprm) set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, !!elf_interpreter); + retval = arch_setup_additional_pages(bprm, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ @@ -1132,6 +1134,17 @@ static int load_elf_binary(struct linux_binprm *bprm) current->mm->start_stack = bprm->p; if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { + /* + * For architectures with ELF randomization, when executing + * a loader directly (i.e. no interpreter listed in ELF + * headers), move the brk area out of the mmap region + * (since it grows up, and may collide early with the stack + * growing down), and into the unused ELF_ET_DYN_BASE region. + */ + if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter) + current->mm->brk = current->mm->start_brk = + ELF_ET_DYN_BASE; + current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); #ifdef compat_brk_randomized @@ -1148,6 +1161,7 @@ static int load_elf_binary(struct linux_binprm *bprm) MAP_FIXED | MAP_PRIVATE, 0); } + regs = current_pt_regs(); #ifdef ELF_PLAT_INIT /* * The ABI may specify that certain registers be set up in special @@ -1176,8 +1190,6 @@ out_free_dentry: allow_write_access(interpreter); if (interpreter) fput(interpreter); -out_free_interp: - kfree(elf_interpreter); out_free_ph: kfree(elf_phdata); goto out; @@ -1456,8 +1468,6 @@ static void fill_elf_header(struct elfhdr *elf, int segs, elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize = sizeof(struct elf_phdr); elf->e_phnum = segs; - - return; } static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) @@ -1470,7 +1480,6 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_memsz = 0; phdr->p_flags = 0; phdr->p_align = 0; - return; } static void fill_note(struct memelfnote *note, const char *name, int type, @@ -1480,7 +1489,6 @@ static void fill_note(struct memelfnote *note, const char *name, int type, note->type = type; note->datasz = sz; note->data = data; - return; } /* diff --git a/fs/block_dev.c b/fs/block_dev.c index f80045048bb7..0f7552a87d54 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,7 +29,6 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> -#include <linux/dax.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/uaccess.h> diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1645fcfd9691..d27720cd3664 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -20,7 +20,6 @@ #include <linux/namei.h> #include <linux/security.h> #include <linux/slab.h> -#include <linux/xattr.h> #include "internal.h" #define CACHEFILES_KEYBUF_SIZE 512 diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index c5234c21b539..f2bb7985d21c 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -39,7 +39,6 @@ #include <linux/device.h> #include <linux/pid_namespace.h> #include <asm/io.h> -#include <linux/poll.h> #include <linux/uaccess.h> #include <linux/coda.h> diff --git a/fs/eventfd.c b/fs/eventfd.c index 08d3bd602f73..93b1fa7bb298 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -21,6 +21,9 @@ #include <linux/eventfd.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/idr.h> + +static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { struct kref kref; @@ -35,6 +38,7 @@ struct eventfd_ctx { */ __u64 count; unsigned int flags; + int id; }; /** @@ -69,6 +73,8 @@ EXPORT_SYMBOL_GPL(eventfd_signal); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { + if (ctx->id >= 0) + ida_simple_remove(&eventfd_ida, ctx->id); kfree(ctx); } @@ -297,6 +303,7 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) seq_printf(m, "eventfd-count: %16llx\n", (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); + seq_printf(m, "eventfd-id: %d\n", ctx->id); } #endif @@ -400,6 +407,7 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; + ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); diff --git a/fs/exec.c b/fs/exec.c index 2e0033348d8e..d88584ebf07f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1652,11 +1652,13 @@ int search_binary_handler(struct linux_binprm *bprm) if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); + bprm->recursion_depth++; retval = fmt->load_binary(bprm); + bprm->recursion_depth--; + read_lock(&binfmt_lock); put_binfmt(fmt); - bprm->recursion_depth--; if (retval < 0 && !bprm->mm) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); diff --git a/fs/fat/file.c b/fs/fat/file.c index b3bed32946b1..0e3ed79fcc3f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -193,12 +193,17 @@ static int fat_file_release(struct inode *inode, struct file *filp) int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; - int res, err; + int err; + + err = __generic_file_fsync(filp, start, end, datasync); + if (err) + return err; - res = generic_file_fsync(filp, start, end, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + if (err) + return err; - return res ? res : err; + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); } diff --git a/fs/proc/base.c b/fs/proc/base.c index b6ccb6c57706..9c8ca6cd3ce4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -510,7 +510,7 @@ static ssize_t lstats_write(struct file *file, const char __user *buf, if (!task) return -ESRCH; - clear_all_latency_tracing(task); + clear_tsk_latency_tracing(task); put_task_struct(task); return count; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 32d8986c26fb..b5b26d8a192c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -450,6 +450,15 @@ fail: static inline __u32 xattr_hash(const char *msg, int len) { + /* + * csum_partial() gives different results for little-endian and + * big endian hosts. Images created on little-endian hosts and + * mounted on big-endian hosts(and vice versa) will see csum mismatches + * when trying to fetch xattrs. Treating the hash as __wsum_t would + * lower the frequency of mismatch. This is an endianness bug in + * reiserfs. The return statement would result in a sparse warning. Do + * not fix the sparse warning so as to not hide a reminder of the bug. + */ return csum_partial(msg, len, 0); } diff --git a/include/asm-generic/shmparam.h b/include/asm-generic/shmparam.h index 8b78c0ba08b1..b8f9035ffc2c 100644 --- a/include/asm-generic/shmparam.h +++ b/include/asm-generic/shmparam.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_SHMPARAM_H #define __ASM_GENERIC_SHMPARAM_H diff --git a/include/asm-generic/sizes.h b/include/asm-generic/sizes.h deleted file mode 100644 index 1dcfad9629ef..000000000000 --- a/include/asm-generic/sizes.h +++ /dev/null @@ -1,2 +0,0 @@ -/* This is a placeholder, to be removed over time */ -#include <linux/sizes.h> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 688ab0de7810..b40fc633f3be 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -15,7 +15,6 @@ struct filename; * This structure is used to hold the arguments that are used when loading binaries. */ struct linux_binprm { - char buf[BINPRM_BUF_SIZE]; #ifdef CONFIG_MMU struct vm_area_struct *vma; unsigned long vma_pages; @@ -64,6 +63,8 @@ struct linux_binprm { unsigned long loader, exec; struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ + + char buf[BINPRM_BUF_SIZE]; } __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 602af23b98c7..cf074bce3eb3 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -60,7 +60,7 @@ static __always_inline unsigned long hweight_long(unsigned long w) */ static inline __u64 rol64(__u64 word, unsigned int shift) { - return (word << shift) | (word >> (64 - shift)); + return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** @@ -70,7 +70,7 @@ static inline __u64 rol64(__u64 word, unsigned int shift) */ static inline __u64 ror64(__u64 word, unsigned int shift) { - return (word >> shift) | (word << (64 - shift)); + return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** @@ -80,7 +80,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift) */ static inline __u32 rol32(__u32 word, unsigned int shift) { - return (word << shift) | (word >> ((-shift) & 31)); + return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** @@ -90,7 +90,7 @@ static inline __u32 rol32(__u32 word, unsigned int shift) */ static inline __u32 ror32(__u32 word, unsigned int shift) { - return (word >> shift) | (word << (32 - shift)); + return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** @@ -100,7 +100,7 @@ static inline __u32 ror32(__u32 word, unsigned int shift) */ static inline __u16 rol16(__u16 word, unsigned int shift) { - return (word << shift) | (word >> (16 - shift)); + return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** @@ -110,7 +110,7 @@ static inline __u16 rol16(__u16 word, unsigned int shift) */ static inline __u16 ror16(__u16 word, unsigned int shift) { - return (word >> shift) | (word << (16 - shift)); + return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** @@ -120,7 +120,7 @@ static inline __u16 ror16(__u16 word, unsigned int shift) */ static inline __u8 rol8(__u8 word, unsigned int shift) { - return (word << shift) | (word >> (8 - shift)); + return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** @@ -130,7 +130,7 @@ static inline __u8 rol8(__u8 word, unsigned int shift) */ static inline __u8 ror8(__u8 word, unsigned int shift) { - return (word >> shift) | (word << (8 - shift)); + return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index ba814f18cb4c..19e58b9138a0 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -140,8 +140,7 @@ struct ftrace_likely_data { * Do not use __always_inline here, since currently it expands to inline again * (which would break users of __always_inline). */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ - !defined(CONFIG_OPTIMIZE_INLINING) +#if !defined(CONFIG_OPTIMIZE_INLINING) #define inline inline __attribute__((__always_inline__)) __gnu_inline \ __maybe_unused notrace #else diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 147bdec42215..21755471b1c3 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -633,8 +633,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { - char *nl = strchr(buf, '\n'); - unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); + unsigned int len = strchrnul(buf, '\n') - buf; return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6ab8c1bada3f..c309f43bde45 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,6 +19,7 @@ struct ipc_ids { struct rw_semaphore rwsem; struct idr ipcs_idr; int max_idx; + int last_idx; /* For wrap around detection */ #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a3b59d143afb..74b1ee9027f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -484,6 +484,7 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); +u64 int_pow(u64 base, unsigned int exp); unsigned long int_sqrt(unsigned long); #if BITS_PER_LONG < 64 diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2c89e60bc752..0f9da966934e 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,7 +4,6 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include <linux/err.h> #include <linux/sched.h> -#include <linux/cgroup.h> __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,6 +197,8 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +struct cgroup_subsys_state; + #ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 7c560e0dc8f4..9022f0c2e2e4 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -36,7 +36,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) __account_scheduler_latency(task, usecs, inter); } -void clear_all_latency_tracing(struct task_struct *p); +void clear_tsk_latency_tracing(struct task_struct *p); extern int sysctl_latencytop(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -48,7 +48,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) { } -static inline void clear_all_latency_tracing(struct task_struct *p) +static inline void clear_tsk_latency_tracing(struct task_struct *p) { } diff --git a/include/linux/list.h b/include/linux/list.h index 9e9a6403dbe4..d3b4db895340 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -151,6 +151,23 @@ static inline void list_replace_init(struct list_head *old, } /** + * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position + * @entry1: the location to place entry2 + * @entry2: the location to place entry1 + */ +static inline void list_swap(struct list_head *entry1, + struct list_head *entry2) +{ + struct list_head *pos = entry2->prev; + + list_del(entry2); + list_replace(entry1, entry2); + if (pos == entry1) + pos = entry2; + list_add(entry1, pos); +} + +/** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h index ba79956e848d..20f178c24e9d 100644 --- a/include/linux/list_sort.h +++ b/include/linux/list_sort.h @@ -6,6 +6,7 @@ struct list_head; +__attribute__((nonnull(2,3))) void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 30561a954ee0..bc74d6a4407c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,8 +94,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct mem_cgroup_stat_cpu { - long count[MEMCG_NR_STAT]; +struct memcg_vmstats_percpu { + long stat[MEMCG_NR_STAT]; unsigned long events[NR_VM_EVENT_ITEMS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; @@ -128,6 +128,7 @@ struct mem_cgroup_per_node { struct lruvec_stat __percpu *lruvec_stat_cpu; atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + atomic_long_t lruvec_stat_local[NR_VM_NODE_STAT_ITEMS]; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -274,13 +275,17 @@ struct mem_cgroup { struct task_struct *move_lock_task; /* memory.stat */ - struct mem_cgroup_stat_cpu __percpu *stat_cpu; + struct memcg_vmstats_percpu __percpu *vmstats_percpu; MEMCG_PADDING(_pad2_); - atomic_long_t stat[MEMCG_NR_STAT]; - atomic_long_t events[NR_VM_EVENT_ITEMS]; - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; + atomic_long_t vmstats[MEMCG_NR_STAT]; + atomic_long_t vmstats_local[MEMCG_NR_STAT]; + + atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; + atomic_long_t vmevents_local[NR_VM_EVENT_ITEMS]; + + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -554,10 +559,9 @@ void unlock_page_memcg(struct page *page); * idx can be of type enum memcg_stat_item or node_stat_item. * Keep in sync with memcg_exact_page_state(). */ -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - int idx) +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->stat[idx]); + long x = atomic_long_read(&memcg->vmstats[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -565,23 +569,23 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return x; } -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __mod_memcg_state(struct mem_cgroup *memcg, - int idx, int val) +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ +static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, + int idx) { - long x; - - if (mem_cgroup_disabled()) - return; - - x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->stat[idx]); + long x = atomic_long_read(&memcg->vmstats_local[idx]); +#ifdef CONFIG_SMP + if (x < 0) x = 0; - } - __this_cpu_write(memcg->stat_cpu->count[idx], x); +#endif + return x; } +void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) @@ -642,32 +646,27 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, return x; } -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) +static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) { struct mem_cgroup_per_node *pn; long x; - /* Update node */ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); - if (mem_cgroup_disabled()) - return; + return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - - /* Update memcg */ - __mod_memcg_state(pn->memcg, idx, val); - - /* Update lruvec */ - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &pn->lruvec_stat[idx]); + x = atomic_long_read(&pn->lruvec_stat_local[idx]); +#ifdef CONFIG_SMP + if (x < 0) x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); +#endif + return x; } +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val); + static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { @@ -708,22 +707,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -static inline void __count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ - unsigned long x; - - if (mem_cgroup_disabled()) - return; - - x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); - if (unlikely(x > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->events[idx]); - x = 0; - } - __this_cpu_write(memcg->stat_cpu->events[idx], x); -} +void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count); static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, @@ -1011,8 +996,13 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - int idx) +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + +static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, + int idx) { return 0; } @@ -1047,6 +1037,12 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } +static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) +{ + return node_page_state(lruvec_pgdat(lruvec), idx); +} + static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 912614fbbef3..0e8834ac32b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -536,9 +536,6 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) struct mmu_gather; struct inode; -#define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) - #if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1f42a07d8f0..8ec38b11b361 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -220,6 +220,9 @@ struct page { #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +#define page_private(page) ((page)->private) +#define set_page_private(page, v) ((page)->private = (v)) + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a4aedc160bd..70394cabaf4e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -18,6 +18,8 @@ #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> +#include <linux/mm_types.h> +#include <linux/page-flags.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ @@ -98,6 +100,62 @@ struct free_area { unsigned long nr_free; }; +/* Used for pages not on another list */ +static inline void add_to_free_area(struct page *page, struct free_area *area, + int migratetype) +{ + list_add(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_area_tail(struct page *page, struct free_area *area, + int migratetype) +{ + list_add_tail(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +/* Used to preserve page allocation order entropy */ +void add_to_free_area_random(struct page *page, struct free_area *area, + int migratetype); +#else +static inline void add_to_free_area_random(struct page *page, + struct free_area *area, int migratetype) +{ + add_to_free_area(page, area, migratetype); +} +#endif + +/* Used for pages which are on another list */ +static inline void move_to_free_area(struct page *page, struct free_area *area, + int migratetype) +{ + list_move(&page->lru, &area->free_list[migratetype]); +} + +static inline struct page *get_page_from_free_area(struct free_area *area, + int migratetype) +{ + return list_first_entry_or_null(&area->free_list[migratetype], + struct page, lru); +} + +static inline void del_page_from_free_area(struct page *page, + struct free_area *area) +{ + list_del(&page->lru); + __ClearPageBuddy(page); + set_page_private(page, 0); + area->nr_free--; +} + +static inline bool free_area_empty(struct free_area *area, int migratetype) +{ + return list_empty(&area->free_list[migratetype]); +} + struct pglist_data; /* @@ -1271,6 +1329,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define pfn_present pfn_valid #endif /* CONFIG_SPARSEMEM */ /* diff --git a/include/linux/plist.h b/include/linux/plist.h index 97883604a3c5..9365df5a823f 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -231,7 +231,7 @@ static inline int plist_node_empty(const struct plist_node *node) * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST # define plist_first_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ @@ -248,7 +248,7 @@ static inline int plist_node_empty(const struct plist_node *node) * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST # define plist_last_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ diff --git a/include/linux/poll.h b/include/linux/poll.h index 7e0fdcf905d2..1cdc32b1f1b0 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -16,7 +16,11 @@ extern struct ctl_table epoll_table[]; /* for sysctl */ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ +#ifdef __clang__ +#define MAX_STACK_ALLOC 768 +#else #define MAX_STACK_ALLOC 832 +#endif #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h index 56f35dd3d01d..44171e6b7197 100644 --- a/include/linux/pps-gpio.h +++ b/include/linux/pps-gpio.h @@ -23,10 +23,11 @@ #define _PPS_GPIO_H struct pps_gpio_platform_data { + struct gpio_desc *gpio_pin; + struct gpio_desc *echo_pin; bool assert_falling_edge; bool capture_clear; - unsigned int gpio_pin; - const char *gpio_label; + unsigned int echo_active_ms; }; #endif /* _PPS_GPIO_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 84ea4d094af3..cefd374c47b1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -82,6 +82,8 @@ static inline void console_verbose(void) extern char devkmsg_log_str[]; struct ctl_table; +extern int suppress_printk; + struct va_format { const char *fmt; va_list *va; diff --git a/include/linux/psi.h b/include/linux/psi.h index 7006008d5b72..7b3de7321219 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,7 @@ #include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> +#include <linux/poll.h> struct seq_file; struct css_set; @@ -11,6 +12,7 @@ struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; +extern struct psi_group psi_system; void psi_init(void); @@ -26,6 +28,13 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); + +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res); +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); + +__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, + poll_table *wait); #endif #else /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 2cf422db5d18..07aaf9b82241 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -1,8 +1,11 @@ #ifndef _LINUX_PSI_TYPES_H #define _LINUX_PSI_TYPES_H +#include <linux/kthread.h> #include <linux/seqlock.h> #include <linux/types.h> +#include <linux/kref.h> +#include <linux/wait.h> #ifdef CONFIG_PSI @@ -11,7 +14,7 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - NR_PSI_TASK_COUNTS, + NR_PSI_TASK_COUNTS = 3, }; /* Task state bitmasks */ @@ -24,7 +27,7 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES, + NR_PSI_RESOURCES = 3, }; /* @@ -41,7 +44,13 @@ enum psi_states { PSI_CPU_SOME, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES, + NR_PSI_STATES = 6, +}; + +enum psi_aggregators { + PSI_AVGS = 0, + PSI_POLL, + NR_PSI_AGGREGATORS, }; struct psi_group_cpu { @@ -53,6 +62,9 @@ struct psi_group_cpu { /* States of the tasks belonging to this group */ unsigned int tasks[NR_PSI_TASK_COUNTS]; + /* Aggregate pressure state derived from the tasks */ + u32 state_mask; + /* Period time sampling buckets for each state of interest (ns) */ u32 times[NR_PSI_STATES]; @@ -62,25 +74,94 @@ struct psi_group_cpu { /* 2nd cacheline updated by the aggregator */ /* Delta detection against the sampling buckets */ - u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; + u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] + ____cacheline_aligned_in_smp; +}; + +/* PSI growth tracking window */ +struct psi_window { + /* Window size in ns */ + u64 size; + + /* Start time of the current window in ns */ + u64 start_time; + + /* Value at the start of the window */ + u64 start_value; + + /* Value growth in the previous window */ + u64 prev_growth; +}; + +struct psi_trigger { + /* PSI state being monitored by the trigger */ + enum psi_states state; + + /* User-spacified threshold in ns */ + u64 threshold; + + /* List node inside triggers list */ + struct list_head node; + + /* Backpointer needed during trigger destruction */ + struct psi_group *group; + + /* Wait queue for polling */ + wait_queue_head_t event_wait; + + /* Pending event flag */ + int event; + + /* Tracking window */ + struct psi_window win; + + /* + * Time last event was generated. Used for rate-limiting + * events to one per window + */ + u64 last_event_time; + + /* Refcounting to prevent premature destruction */ + struct kref refcount; }; struct psi_group { - /* Protects data updated during an aggregation */ - struct mutex stat_lock; + /* Protects data used by the aggregator */ + struct mutex avgs_lock; /* Per-cpu task state & time tracking */ struct psi_group_cpu __percpu *pcpu; - /* Periodic aggregation state */ - u64 total_prev[NR_PSI_STATES - 1]; - u64 last_update; - u64 next_update; - struct delayed_work clock_work; + /* Running pressure averages */ + u64 avg_total[NR_PSI_STATES - 1]; + u64 avg_last_update; + u64 avg_next_update; + + /* Aggregator work control */ + struct delayed_work avgs_work; /* Total stall times and sampled pressure averages */ - u64 total[NR_PSI_STATES - 1]; + u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; unsigned long avg[NR_PSI_STATES - 1][3]; + + /* Monitor work control */ + atomic_t poll_scheduled; + struct kthread_worker __rcu *poll_kworker; + struct kthread_delayed_work poll_work; + + /* Protects data used by the monitor */ + struct mutex trigger_lock; + + /* Configured polling triggers */ + struct list_head triggers; + u32 nr_triggers[NR_PSI_STATES - 1]; + u32 poll_states; + u64 poll_min_period; + + /* Total stall times at the start of monitor activation */ + u64 polling_total[NR_PSI_STATES - 1]; + u64 polling_next_update; + u64 polling_until; }; #else /* CONFIG_PSI */ diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index 3bcd67fd5548..dd464943f717 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved. */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e63799a6e895..3734cd8f38a8 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -14,6 +14,7 @@ struct device; #define SYS_POWER_OFF 0x0003 /* Notify of system power off */ enum reboot_mode { + REBOOT_UNDEFINED = -1, REBOOT_COLD = 0, REBOOT_WARM, REBOOT_HARD, @@ -21,6 +22,7 @@ enum reboot_mode { REBOOT_GPIO, }; extern enum reboot_mode reboot_mode; +extern enum reboot_mode panic_reboot_mode; enum reboot_type { BOOT_TRIPLE = 't', diff --git a/include/linux/sched.h b/include/linux/sched.h index a2cd15855bad..11837410690f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,7 +26,6 @@ #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/signal_types.h> -#include <linux/psi_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/rseq.h> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e412c092c1e8..38a0f0785323 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -271,17 +271,18 @@ static inline int signal_group_exit(const struct signal_struct *sig) extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info); +extern int dequeue_signal(struct task_struct *task, + sigset_t *mask, kernel_siginfo_t *info); static inline int kernel_dequeue_signal(void) { - struct task_struct *tsk = current; + struct task_struct *task = current; kernel_siginfo_t __info; int ret; - spin_lock_irq(&tsk->sighand->siglock); - ret = dequeue_signal(tsk, &tsk->blocked, &__info); - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock_irq(&task->sighand->siglock); + ret = dequeue_signal(task, &task->blocked, &__info); + spin_unlock_irq(&task->sighand->siglock); return ret; } @@ -419,18 +420,18 @@ static inline void set_restore_sigmask(void) WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } -static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +static inline void clear_tsk_restore_sigmask(struct task_struct *task) { - clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); + clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } -static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +static inline bool test_tsk_restore_sigmask(struct task_struct *task) { - return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); + return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { @@ -449,9 +450,9 @@ static inline void set_restore_sigmask(void) current->restore_sigmask = true; WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } -static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +static inline void clear_tsk_restore_sigmask(struct task_struct *task) { - tsk->restore_sigmask = false; + task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { @@ -461,9 +462,9 @@ static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } -static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +static inline bool test_tsk_restore_sigmask(struct task_struct *task) { - return tsk->restore_sigmask; + return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { @@ -617,9 +618,9 @@ static inline struct pid *task_session(struct task_struct *task) return task->signal->pids[PIDTYPE_SID]; } -static inline int get_nr_threads(struct task_struct *tsk) +static inline int get_nr_threads(struct task_struct *task) { - return tsk->signal->nr_threads; + return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) @@ -658,35 +659,35 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, +extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, +static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; - ret = __lock_task_sighand(tsk, flags); - (void)__cond_lock(&tsk->sighand->siglock, ret); + ret = __lock_task_sighand(task, flags); + (void)__cond_lock(&task->sighand->siglock, ret); return ret; } -static inline void unlock_task_sighand(struct task_struct *tsk, +static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { - spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); + spin_unlock_irqrestore(&task->sighand->siglock, *flags); } -static inline unsigned long task_rlimit(const struct task_struct *tsk, +static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { - return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(task->signal->rlim[limit].rlim_cur); } -static inline unsigned long task_rlimit_max(const struct task_struct *tsk, +static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { - return READ_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) diff --git a/init/Kconfig b/init/Kconfig index 82b84e5ee30d..8b9ffe236e4f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1752,6 +1752,30 @@ config SLAB_FREELIST_HARDENED sacrifies to harden the kernel slab allocator against common freelist exploit methods. +config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA + help + Randomization of the page allocator improves the average + utilization of a direct-mapped memory-side-cache. See section + 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI + 6.2a specification for an example of how a platform advertises + the presence of a memory-side-cache. There are also incidental + security benefits as it reduces the predictability of page + allocations to compliment SLAB_FREELIST_RANDOM, but the + default granularity of shuffling on the "MAX_ORDER - 1" i.e, + 10th order of pages is selected based on cache utilization + benefits on x86. + + While the randomization improves cache utilization it may + negatively impact workloads on platforms without a cache. For + this reason, by default, the randomization is enabled only + after runtime detection of a direct-mapped memory-side-cache. + Otherwise, the randomization may be force enabled with the + 'page_alloc.shuffle' kernel command line parameter. + + Say Y if unsure. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 49f9bf4ffc7f..bfaae457810c 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -120,7 +120,9 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, static int zero; static int one = 1; static int int_max = INT_MAX; -static int ipc_mni = IPCMNI; +int ipc_mni = IPCMNI; +int ipc_mni_shift = IPCMNI_SHIFT; +int ipc_min_cycle = RADIX_TREE_MAP_SIZE; static struct ctl_table ipc_kern_table[] = { { @@ -246,3 +248,13 @@ static int __init ipc_sysctl_init(void) } device_initcall(ipc_sysctl_init); + +static int __init ipc_mni_extend(char *str) +{ + ipc_mni = IPCMNI_EXTEND; + ipc_mni_shift = IPCMNI_EXTEND_SHIFT; + ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE; + pr_info("IPCMNI extended to %d.\n", ipc_mni); + return 0; +} +early_param("ipcmni_extend", ipc_mni_extend); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index ba44164ea1f9..216cad1ff0d0 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -76,6 +76,7 @@ struct mqueue_inode_info { wait_queue_head_t wait_q; struct rb_root msg_tree; + struct rb_node *msg_tree_rightmost; struct posix_msg_tree_node *node_cache; struct mq_attr attr; @@ -131,6 +132,7 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) { struct rb_node **p, *parent = NULL; struct posix_msg_tree_node *leaf; + bool rightmost = true; p = &info->msg_tree.rb_node; while (*p) { @@ -139,9 +141,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) if (likely(leaf->priority == msg->m_type)) goto insert_msg; - else if (msg->m_type < leaf->priority) + else if (msg->m_type < leaf->priority) { p = &(*p)->rb_left; - else + rightmost = false; + } else p = &(*p)->rb_right; } if (info->node_cache) { @@ -154,6 +157,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) INIT_LIST_HEAD(&leaf->msg_list); } leaf->priority = msg->m_type; + + if (rightmost) + info->msg_tree_rightmost = &leaf->rb_node; + rb_link_node(&leaf->rb_node, parent, p); rb_insert_color(&leaf->rb_node, &info->msg_tree); insert_msg: @@ -163,23 +170,35 @@ insert_msg: return 0; } +static inline void msg_tree_erase(struct posix_msg_tree_node *leaf, + struct mqueue_inode_info *info) +{ + struct rb_node *node = &leaf->rb_node; + + if (info->msg_tree_rightmost == node) + info->msg_tree_rightmost = rb_prev(node); + + rb_erase(node, &info->msg_tree); + if (info->node_cache) { + kfree(leaf); + } else { + info->node_cache = leaf; + } +} + static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) { - struct rb_node **p, *parent = NULL; + struct rb_node *parent = NULL; struct posix_msg_tree_node *leaf; struct msg_msg *msg; try_again: - p = &info->msg_tree.rb_node; - while (*p) { - parent = *p; - /* - * During insert, low priorities go to the left and high to the - * right. On receive, we want the highest priorities first, so - * walk all the way to the right. - */ - p = &(*p)->rb_right; - } + /* + * During insert, low priorities go to the left and high to the + * right. On receive, we want the highest priorities first, so + * walk all the way to the right. + */ + parent = info->msg_tree_rightmost; if (!parent) { if (info->attr.mq_curmsgs) { pr_warn_once("Inconsistency in POSIX message queue, " @@ -194,24 +213,14 @@ try_again: pr_warn_once("Inconsistency in POSIX message queue, " "empty leaf node but we haven't implemented " "lazy leaf delete!\n"); - rb_erase(&leaf->rb_node, &info->msg_tree); - if (info->node_cache) { - kfree(leaf); - } else { - info->node_cache = leaf; - } + msg_tree_erase(leaf, info); goto try_again; } else { msg = list_first_entry(&leaf->msg_list, struct msg_msg, m_list); list_del(&msg->m_list); if (list_empty(&leaf->msg_list)) { - rb_erase(&leaf->rb_node, &info->msg_tree); - if (info->node_cache) { - kfree(leaf); - } else { - info->node_cache = leaf; - } + msg_tree_erase(leaf, info); } } info->attr.mq_curmsgs--; @@ -254,6 +263,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->qsize = 0; info->user = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; + info->msg_tree_rightmost = NULL; info->node_cache = NULL; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, @@ -430,7 +440,8 @@ static void mqueue_evict_inode(struct inode *inode) struct user_struct *user; unsigned long mq_bytes, mq_treesize; struct ipc_namespace *ipc_ns; - struct msg_msg *msg; + struct msg_msg *msg, *nmsg; + LIST_HEAD(tmp_msg); clear_inode(inode); @@ -441,10 +452,15 @@ static void mqueue_evict_inode(struct inode *inode) info = MQUEUE_I(inode); spin_lock(&info->lock); while ((msg = msg_get(info)) != NULL) - free_msg(msg); + list_add_tail(&msg->m_list, &tmp_msg); kfree(info->node_cache); spin_unlock(&info->lock); + list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) { + list_del(&msg->m_list); + free_msg(msg); + } + /* Total amount of bytes accounted for the mqueue */ mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * @@ -605,8 +621,6 @@ static void wq_add(struct mqueue_inode_info *info, int sr, { struct ext_wait_queue *walk; - ewp->task = current; - list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { if (walk->task->prio <= current->prio) { list_add_tail(&ewp->list, &walk->list); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 84598025a6ad..e65593742e2b 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -18,6 +18,7 @@ #include <linux/utsname.h> #include <linux/proc_ns.h> #include <linux/uaccess.h> +#include <linux/sched.h> #include "util.h" @@ -64,6 +65,9 @@ static struct msg_msg *alloc_msg(size_t len) pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; + + cond_resched(); + alen = min(len, DATALEN_SEG); seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT); if (seg == NULL) @@ -176,6 +180,8 @@ void free_msg(struct msg_msg *msg) kfree(msg); while (seg != NULL) { struct msg_msgseg *tmp = seg->next; + + cond_resched(); kfree(seg); seg = tmp; } diff --git a/ipc/util.c b/ipc/util.c index 095274a871f8..d126d156efc6 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -109,7 +109,7 @@ static const struct rhashtable_params ipc_kht_params = { * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited - * below IPCMNI) then initialise the keys hashtable and ids idr. + * below ipc_mni) then initialise the keys hashtable and ids idr. */ void ipc_init_ids(struct ipc_ids *ids) { @@ -119,6 +119,7 @@ void ipc_init_ids(struct ipc_ids *ids) rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); ids->max_idx = -1; + ids->last_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif @@ -192,6 +193,10 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * * The caller must own kern_ipc_perm.lock.of the new object. * On error, the function returns a (negative) error code. + * + * To conserve sequence number space, especially with extended ipc_mni, + * the sequence number is incremented only when the returned ID is less than + * the last one. */ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { @@ -215,17 +220,42 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT); + int max_idx; + + max_idx = max(ids->in_use*3/2, ipc_min_cycle); + max_idx = min(max_idx, ipc_mni); + + /* allocate the idx, with a NULL struct kern_ipc_perm */ + idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, + GFP_NOWAIT); + + if (idx >= 0) { + /* + * idx got allocated successfully. + * Now calculate the sequence number and set the + * pointer for real. + */ + if (idx <= ids->last_idx) { + ids->seq++; + if (ids->seq >= ipcid_seq_max()) + ids->seq = 0; + } + ids->last_idx = idx; + + new->seq = ids->seq; + /* no need for smp_wmb(), this is done + * inside idr_replace, as part of + * rcu_assign_pointer + */ + idr_replace(&ids->ipcs_idr, new, idx); + } } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, GFP_NOWAIT); } if (idx >= 0) - new->id = SEQ_MULTIPLIER * new->seq + idx; + new->id = (new->seq << ipcmni_seq_shift()) + idx; return idx; } @@ -253,8 +283,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); - if (limit > IPCMNI) - limit = IPCMNI; + if (limit > ipc_mni) + limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; @@ -737,7 +767,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, if (total >= ids->in_use) return NULL; - for (; pos < IPCMNI; pos++) { + for (; pos < ipc_mni; pos++) { ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; diff --git a/ipc/util.h b/ipc/util.h index e272be622ae7..0fcf8e719b76 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -15,8 +15,37 @@ #include <linux/err.h> #include <linux/ipc_namespace.h> -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ -#define SEQ_MULTIPLIER (IPCMNI) +/* + * The IPC ID contains 2 separate numbers - index and sequence number. + * By default, + * bits 0-14: index (32k, 15 bits) + * bits 15-30: sequence number (64k, 16 bits) + * + * When IPCMNI extension mode is turned on, the composition changes: + * bits 0-23: index (16M, 24 bits) + * bits 24-30: sequence number (128, 7 bits) + */ +#define IPCMNI_SHIFT 15 +#define IPCMNI_EXTEND_SHIFT 24 +#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE) +#define IPCMNI (1 << IPCMNI_SHIFT) +#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT) + +#ifdef CONFIG_SYSVIPC_SYSCTL +extern int ipc_mni; +extern int ipc_mni_shift; +extern int ipc_min_cycle; + +#define ipcmni_seq_shift() ipc_mni_shift +#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1) + +#else /* CONFIG_SYSVIPC_SYSCTL */ + +#define ipc_mni IPCMNI +#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE) +#define ipcmni_seq_shift() IPCMNI_SHIFT +#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1) +#endif /* CONFIG_SYSVIPC_SYSCTL */ void sem_init(void); void msg_init(void); @@ -96,9 +125,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *); #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 -#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) -#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX) +#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK) +#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift()) +#define ipcid_seq_max() (INT_MAX >> ipcmni_seq_shift()) /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); @@ -123,8 +152,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids) if (ids->in_use == 0) return -1; - if (ids->in_use == IPCMNI) - return IPCMNI - 1; + if (ids->in_use == ipc_mni) + return ipc_mni - 1; return ids->max_idx; } @@ -216,10 +245,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static inline int sem_check_semmni(struct ipc_namespace *ns) { /* - * Check semmni range [0, IPCMNI] + * Check semmni range [0, ipc_mni] * semmni is the last element of sem_ctls[4] array */ - return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI)) + return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni)) ? -ERANGE : 0; } diff --git a/kernel/Makefile b/kernel/Makefile index 298437bb2c6a..33824f0385b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 327f37c9fdfa..217cec4e22c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3540,17 +3540,84 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); } -#endif + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { @@ -4743,20 +4810,26 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..8361a560cd1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,7 +422,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -462,7 +462,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -483,7 +483,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } diff --git a/kernel/fork.c b/kernel/fork.c index 737db1828437..b4cba953040a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -1343,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1726,6 +1736,21 @@ static int pidfd_create(struct pid *pid) return fd; } +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2068,7 +2093,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif - clear_all_latency_tracing(p); + clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); @@ -2233,8 +2258,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2265,7 +2292,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..f71c1adcff31 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL + depends on CC_IS_GCC ---help--- The gcov format is usually determined by the GCC version, and the default is chosen according to your GCC version. However, there are @@ -62,7 +63,7 @@ choice config GCOV_FORMAT_3_4 bool "GCC 3.4 format" - depends on CC_IS_GCC && GCC_VERSION < 40700 + depends on GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@ #include <linux/sched.h> #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = gcov_info_version(info); - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. - */ - gcov_info_link(info); - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock); /** * gcov_enable_events - enable event reporting through gcov_event() @@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within_module((unsigned long)info, mod)) { + if (gcov_info_within_module(info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. + * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data. LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + * llvm_gcda_start_file() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * [... repeats for each function ...] + * llvm_gcda_summary_info() + * llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit. This forces us to keep some local state + * about which module we're dealing with at the moment. On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/printk.h> +#include <linux/ratelimit.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { + struct list_head head; + + const char *filename; + unsigned int version; + u32 checksum; + + struct list_head functions; +}; + +struct gcov_fn_info { + struct list_head head; + + u32 ident; + u32 checksum; + u8 use_extra_checksum; + u32 cfg_checksum; + + u32 num_counters; + u64 *counters; + const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ + struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + INIT_LIST_HEAD(&info->functions); + + mutex_lock(&gcov_lock); + + list_add_tail(&info->head, &clang_gcov_list); + current_info = info; + writeout(); + current_info = NULL; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], + u32 checksum) +{ + current_info->filename = orig_filename; + memcpy(¤t_info->version, version, sizeof(current_info->version)); + current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, + u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ + struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + info->ident = ident; + info->checksum = func_checksum; + info->use_extra_checksum = use_extra_checksum; + info->cfg_checksum = cfg_checksum; + if (function_name) + info->function_name = kstrdup(function_name, GFP_KERNEL); + + list_add_tail(&info->head, ¤t_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ + struct gcov_fn_info *info = list_last_entry(¤t_info->functions, + struct gcov_fn_info, head); + + info->num_counters = num_counters; + info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return list_first_entry_or_null(&clang_gcov_list, + struct gcov_info, head); + if (list_is_last(&info->head, &clang_gcov_list)) + return NULL; + return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + /* Generic code unlinks while iterating. */ + __list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_fn_info *fn; + + list_for_each_entry(fn, &info->functions, head) + memset(fn->counters, 0, + sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( + &info1->functions, struct gcov_fn_info, head); + struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( + &info2->functions, struct gcov_fn_info, head); + + if (info1->checksum != info2->checksum) + return false; + if (!fn_ptr1) + return fn_ptr1 == fn_ptr2; + while (!list_is_last(&fn_ptr1->head, &info1->functions) && + !list_is_last(&fn_ptr2->head, &info2->functions)) { + if (fn_ptr1->checksum != fn_ptr2->checksum) + return false; + if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) + return false; + if (fn_ptr1->use_extra_checksum && + fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; + fn_ptr1 = list_next_entry(fn_ptr1, head); + fn_ptr2 = list_next_entry(fn_ptr2, head); + } + return list_is_last(&fn_ptr1->head, &info1->functions) && + list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_fn_info *dfn_ptr; + struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, + struct gcov_fn_info, head); + + list_for_each_entry(dfn_ptr, &dst->functions, head) { + u32 i; + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; + } +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ + size_t cv_size; /* counter values size */ + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), + GFP_KERNEL); + if (!fn_dup) + return NULL; + INIT_LIST_HEAD(&fn_dup->head); + + fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); + if (!fn_dup->function_name) + goto err_name; + + cv_size = fn->num_counters * sizeof(fn->counters[0]); + fn_dup->counters = vmalloc(cv_size); + if (!fn_dup->counters) + goto err_counters; + memcpy(fn_dup->counters, fn->counters, cv_size); + + return fn_dup; + +err_counters: + kfree(fn_dup->function_name); +err_name: + kfree(fn_dup); + return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. + */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_fn_info *fn; + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + INIT_LIST_HEAD(&dup->head); + INIT_LIST_HEAD(&dup->functions); + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err; + + list_for_each_entry(fn, &info->functions, head) { + struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + + if (!fn_dup) + goto err; + list_add_tail(&fn_dup->head, &dup->functions); + } + + return dup; + +err: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + struct gcov_fn_info *fn, *tmp; + + list_for_each_entry_safe(fn, tmp, &info->functions, head) { + kfree(fn->function_name); + vfree(fn->counters); + list_del(&fn->head); + kfree(fn); + } + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + size_t pos = 0; + + /* File header. */ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->checksum); + + list_for_each_entry(fi_ptr, &info->functions, head) { + u32 i; + u32 len = 2; + + if (fi_ptr->use_extra_checksum) + len++; + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, len); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); + if (fi_ptr->use_extra_checksum) + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); + pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); + for (i = 0; i < fi_ptr->num_counters; i++) + pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = gcov_info_version(info); + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + gcov_info_link(info); + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@ #ifndef GCOV_H #define GCOV_H GCOV_H +#include <linux/module.h> #include <linux/types.h> /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod); /* Base interface. */ enum gcov_action { @@ -83,4 +85,7 @@ struct gcov_link { }; extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; + #endif /* GCOV_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..be4e8795561a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> +#include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 99a5b5f46dc5..871734ea2f04 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -67,13 +67,10 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; - if (!latencytop_enabled) - return; - raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; @@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk, int firstnonnull = MAXLR + 1; int i; - if (!latencytop_enabled) - return; - /* skip kernel threads for now */ if (!tsk->mm) return; diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..bfc95b3e4235 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { + WARN_ONCE(((*nl) == n), "double register detected"); if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); diff --git a/kernel/panic.c b/kernel/panic.c index c1fcaad337b7..8779d64bace0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -306,6 +306,8 @@ void panic(const char *fmt, ...) * shutting down. But if there is a chance of * rebooting the system it will be rebooted. */ + if (panic_reboot_mode != REBOOT_UNDEFINED) + reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ @@ -321,6 +323,9 @@ void panic(const char *fmt, ...) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + + /* Do not scroll important messages printed above */ + suppress_printk = 1; local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..89548d35eefb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -32,7 +32,6 @@ #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> -#include <linux/hash.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..17102fd4c136 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. + */ +int __read_mostly suppress_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; u64 curr_log_seq; + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..b9e79e8c7226 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; /* * This variable is used privately to keep track of whether or not @@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); static int __init reboot_setup(char *str) { for (;;) { + enum reboot_mode *mode; + /* * Having anything passed on the command line via * reboot= will cause us to disable DMI checking @@ -526,17 +529,24 @@ static int __init reboot_setup(char *str) */ reboot_default = 0; + if (!strncmp(str, "panic_", 6)) { + mode = &panic_reboot_mode; + str += 6; + } else { + mode = &reboot_mode; + } + switch (*str) { case 'w': - reboot_mode = REBOOT_WARM; + *mode = REBOOT_WARM; break; case 'c': - reboot_mode = REBOOT_COLD; + *mode = REBOOT_COLD; break; case 'h': - reboot_mode = REBOOT_HARD; + *mode = REBOOT_HARD; break; case 's': @@ -553,11 +563,11 @@ static int __init reboot_setup(char *str) if (rc) return rc; } else - reboot_mode = REBOOT_SOFT; + *mode = REBOOT_SOFT; break; } case 'g': - reboot_mode = REBOOT_GPIO; + *mode = REBOOT_GPIO; break; case 'b': diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner <hannes@cmpxchg.org> * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/seqlock.h> +#include <linux/uaccess.h> #include <linux/cgroup.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h> #include <linux/psi.h> #include "sched.h" @@ -140,9 +147,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { @@ -156,16 +163,21 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +185,20 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; + + *pchanged_states = 0; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; + u32 changed_states = 0; int cpu; int s; - mutex_lock(&group->stat_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, aggregator, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); + + if (pchanged_states) + *pchanged_states = changed_states; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; /* avgX= */ - now = sched_clock(); - expires = group->next_update; - if (now < expires) - goto out; + expires = group->avg_next_update; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->stat_lock); - return nonidle_total; + + return avg_next_update; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; + u64 now; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); + + mutex_lock(&group->avgs_lock); + now = sched_clock(); + + collect_percpu_times(group, PSI_AVGS, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); + } + + mutex_unlock(&group->avgs_lock); +} + +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); - now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; - schedule_delayed_work(dwork, delay); + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. + */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; @@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. + */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 62f9aea4a15a..c4dd66436fc5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -840,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; + /* fall through */ default: return -EPERM; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba158f61aab4..943c89178e3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2886,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; @@ -3170,17 +3172,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (write) { char *kbuf, *p; + size_t skipped = 0; - if (left > PAGE_SIZE - 1) + if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } p = kbuf = memdup_user_nul(buffer, left); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), - sizeof(unsigned long), - GFP_KERNEL); + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); return -ENOMEM; @@ -3189,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, while (!err && left) { unsigned long val_a, val_b; bool neg; + size_t saved_left; + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_a >= bitmap_len || neg) { @@ -3209,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_b >= bitmap_len || neg || @@ -3227,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, proc_skip_char(&p, &left, '\n'); } kfree(kbuf); + left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -3271,7 +3298,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, *ppos += *lenp; } - kfree(tmp_bitmap); + bitmap_free(tmp_bitmap); return err; } diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..88b834f0eebc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid) if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) - goto out_unlock; + return NULL; new->uid = uid; refcount_set(&new->__count, 1); @@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid) } return up; - -out_unlock: - return NULL; } static int __init uid_cache_init(void) diff --git a/lib/Kconfig b/lib/Kconfig index f323b85ad11c..3577609b61be 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -46,9 +46,6 @@ config HAVE_ARCH_BITREVERSE This option enables the use of hardware bit-reversal instructions on architectures which support such operations. -config RATIONAL - bool - config GENERIC_STRNCPY_FROM_USER bool @@ -61,6 +58,8 @@ config GENERIC_NET_UTILS config GENERIC_FIND_FIRST_BIT bool +source "lib/math/Kconfig" + config NO_GENERIC_PCI_IOPORT_MAP bool @@ -531,12 +530,6 @@ config LRU_CACHE config CLZ_TAB bool -config CORDIC - tristate "CORDIC algorithm" - help - This option provides an implementation of the CORDIC algorithm; - calculations are in fixed point. Module will be called cordic. - config DDR bool "JEDEC DDR data" help @@ -628,9 +621,6 @@ config SBITMAP config PARMAN tristate "parman" if COMPILE_TEST -config PRIME_NUMBERS - tristate - config STRING_SELFTEST tristate "Test string functions" diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d695ec1477f3..fdfa173651eb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -318,6 +318,20 @@ config HEADERS_CHECK exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in your build tree), to make sure they're suitable. +config OPTIMIZE_INLINING + bool "Allow compiler to uninline functions marked 'inline'" + help + This option determines if the kernel forces gcc to inline the functions + developers have marked 'inline'. Doing so takes away freedom from gcc to + do what it thinks is best, which is desirable for the gcc 3.x series of + compilers. The gcc 4.x series have a rewritten inlining algorithm and + enabling this option will generate a smaller kernel there. Hopefully + this algorithm is so good that allowing gcc 4.x and above to make the + decision will become the default in the future. Until then this option + is there to test gcc for this. + + If unsure, say N. + config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help @@ -446,6 +460,15 @@ config DEBUG_KERNEL Say Y here if you are developing drivers or trying to debug and identify kernel problems. +config DEBUG_MISC + bool "Miscellaneous debug code" + default DEBUG_KERNEL + depends on DEBUG_KERNEL + help + Say Y here if you need to enable miscellaneous debug code that should + be under a more specific debug option but isn't. + + menu "Memory Debugging" source "mm/Kconfig.debug" @@ -1358,7 +1381,7 @@ config DEBUG_LIST If unsure, say N. -config DEBUG_PI_LIST +config DEBUG_PLIST bool "Debug priority linked list manipulation" depends on DEBUG_KERNEL help diff --git a/lib/Makefile b/lib/Makefile index 83d7df2661ff..fb7697031a79 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -30,7 +30,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o timerqueue.o xarray.o \ - idr.o int_sqrt.o extable.o \ + idr.o extable.o \ sha1.o chacha.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ @@ -44,11 +44,11 @@ lib-$(CONFIG_SMP) += cpumask.o lib-y += kobject.o klist.o obj-y += lockref.o -obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ +obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o iov_iter.o clz_ctz.o \ + list_sort.o uuid.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o rhashtable.o reciprocal_div.o \ + percpu-refcount.o rhashtable.o \ once.o refcount.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o obj-$(CONFIG_STRING_SELFTEST) += test_string.o @@ -102,6 +102,8 @@ endif obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any) +obj-y += math/ + obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o @@ -121,7 +123,6 @@ obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o obj-$(CONFIG_BITREVERSE) += bitrev.o obj-$(CONFIG_PACKING) += packing.o -obj-$(CONFIG_RATIONAL) += rational.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC16) += crc16.o obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o @@ -195,8 +196,6 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o -obj-$(CONFIG_CORDIC) += cordic.o - obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o @@ -238,8 +237,6 @@ obj-$(CONFIG_ASN1) += asn1_decoder.o obj-$(CONFIG_FONT_SUPPORT) += fonts/ -obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o - hostprogs-y := gen_crc32table hostprogs-y += gen_crc64table clean-files := crc32table.h diff --git a/lib/bitmap.c b/lib/bitmap.c index 98872e9025da..f235434df87b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -20,6 +20,8 @@ #include <asm/page.h> +#include "kstrtox.h" + /** * DOC: bitmap introduction * @@ -477,12 +479,128 @@ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, } EXPORT_SYMBOL(bitmap_print_to_pagebuf); +/* + * Region 9-38:4/10 describes the following bitmap structure: + * 0 9 12 18 38 + * .........****......****......****...... + * ^ ^ ^ ^ + * start off group_len end + */ +struct region { + unsigned int start; + unsigned int off; + unsigned int group_len; + unsigned int end; +}; + +static int bitmap_set_region(const struct region *r, + unsigned long *bitmap, int nbits) +{ + unsigned int start; + + if (r->end >= nbits) + return -ERANGE; + + for (start = r->start; start <= r->end; start += r->group_len) + bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); + + return 0; +} + +static int bitmap_check_region(const struct region *r) +{ + if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) + return -EINVAL; + + return 0; +} + +static const char *bitmap_getnum(const char *str, unsigned int *num) +{ + unsigned long long n; + unsigned int len; + + len = _parse_integer(str, 10, &n); + if (!len) + return ERR_PTR(-EINVAL); + if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n) + return ERR_PTR(-EOVERFLOW); + + *num = n; + return str + len; +} + +static inline bool end_of_str(char c) +{ + return c == '\0' || c == '\n'; +} + +static inline bool __end_of_region(char c) +{ + return isspace(c) || c == ','; +} + +static inline bool end_of_region(char c) +{ + return __end_of_region(c) || end_of_str(c); +} + +/* + * The format allows commas and whitespases at the beginning + * of the region. + */ +static const char *bitmap_find_region(const char *str) +{ + while (__end_of_region(*str)) + str++; + + return end_of_str(*str) ? NULL : str; +} + +static const char *bitmap_parse_region(const char *str, struct region *r) +{ + str = bitmap_getnum(str, &r->start); + if (IS_ERR(str)) + return str; + + if (end_of_region(*str)) + goto no_end; + + if (*str != '-') + return ERR_PTR(-EINVAL); + + str = bitmap_getnum(str + 1, &r->end); + if (IS_ERR(str)) + return str; + + if (end_of_region(*str)) + goto no_pattern; + + if (*str != ':') + return ERR_PTR(-EINVAL); + + str = bitmap_getnum(str + 1, &r->off); + if (IS_ERR(str)) + return str; + + if (*str != '/') + return ERR_PTR(-EINVAL); + + return bitmap_getnum(str + 1, &r->group_len); + +no_end: + r->end = r->start; +no_pattern: + r->off = r->end + 1; + r->group_len = r->end + 1; + + return end_of_str(*str) ? NULL : str; +} + /** - * __bitmap_parselist - convert list format ASCII string to bitmap - * @buf: read nul-terminated user string from this buffer - * @buflen: buffer size in bytes. If string is smaller than this - * then it must be terminated with a \0. - * @is_user: location of buffer, 0 indicates kernel space + * bitmap_parselist - convert list format ASCII string to bitmap + * @buf: read user string from this buffer; must be terminated + * with a \0 or \n. * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * @@ -498,127 +616,38 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); * * Returns: 0 on success, -errno on invalid input strings. Error values: * - * - ``-EINVAL``: second number in range smaller than first + * - ``-EINVAL``: wrong region format * - ``-EINVAL``: invalid character in string * - ``-ERANGE``: bit number specified too large for mask + * - ``-EOVERFLOW``: integer overflow in the input parameters */ -static int __bitmap_parselist(const char *buf, unsigned int buflen, - int is_user, unsigned long *maskp, - int nmaskbits) +int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) { - unsigned int a, b, old_a, old_b; - unsigned int group_size, used_size, off; - int c, old_c, totaldigits, ndigits; - const char __user __force *ubuf = (const char __user __force *)buf; - int at_start, in_range, in_partial_range; + struct region r; + long ret; - totaldigits = c = 0; - old_a = old_b = 0; - group_size = used_size = 0; bitmap_zero(maskp, nmaskbits); - do { - at_start = 1; - in_range = 0; - in_partial_range = 0; - a = b = 0; - ndigits = totaldigits; - - /* Get the next cpu# or a range of cpu#'s */ - while (buflen) { - old_c = c; - if (is_user) { - if (__get_user(c, ubuf++)) - return -EFAULT; - } else - c = *buf++; - buflen--; - if (isspace(c)) - continue; - - /* A '\0' or a ',' signal the end of a cpu# or range */ - if (c == '\0' || c == ',') - break; - /* - * whitespaces between digits are not allowed, - * but it's ok if whitespaces are on head or tail. - * when old_c is whilespace, - * if totaldigits == ndigits, whitespace is on head. - * if whitespace is on tail, it should not run here. - * as c was ',' or '\0', - * the last code line has broken the current loop. - */ - if ((totaldigits != ndigits) && isspace(old_c)) - return -EINVAL; - if (c == '/') { - used_size = a; - at_start = 1; - in_range = 0; - a = b = 0; - continue; - } + while (buf) { + buf = bitmap_find_region(buf); + if (buf == NULL) + return 0; - if (c == ':') { - old_a = a; - old_b = b; - at_start = 1; - in_range = 0; - in_partial_range = 1; - a = b = 0; - continue; - } + buf = bitmap_parse_region(buf, &r); + if (IS_ERR(buf)) + return PTR_ERR(buf); - if (c == '-') { - if (at_start || in_range) - return -EINVAL; - b = 0; - in_range = 1; - at_start = 1; - continue; - } + ret = bitmap_check_region(&r); + if (ret) + return ret; - if (!isdigit(c)) - return -EINVAL; + ret = bitmap_set_region(&r, maskp, nmaskbits); + if (ret) + return ret; + } - b = b * 10 + (c - '0'); - if (!in_range) - a = b; - at_start = 0; - totaldigits++; - } - if (ndigits == totaldigits) - continue; - if (in_partial_range) { - group_size = a; - a = old_a; - b = old_b; - old_a = old_b = 0; - } else { - used_size = group_size = b - a + 1; - } - /* if no digit is after '-', it's wrong*/ - if (at_start && in_range) - return -EINVAL; - if (!(a <= b) || group_size == 0 || !(used_size <= group_size)) - return -EINVAL; - if (b >= nmaskbits) - return -ERANGE; - while (a <= b) { - off = min(b - a + 1, used_size); - bitmap_set(maskp, a, off); - a += group_size; - } - } while (buflen && c == ','); return 0; } - -int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) -{ - char *nl = strchrnul(bp, '\n'); - int len = nl - bp; - - return __bitmap_parselist(bp, len, 0, maskp, nmaskbits); -} EXPORT_SYMBOL(bitmap_parselist); @@ -632,23 +661,27 @@ EXPORT_SYMBOL(bitmap_parselist); * @nmaskbits: size of bitmap, in bits. * * Wrapper for bitmap_parselist(), providing it with user buffer. - * - * We cannot have this as an inline function in bitmap.h because it needs - * linux/uaccess.h to get the access_ok() declaration and this causes - * cyclic dependencies. */ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { - if (!access_ok(ubuf, ulen)) - return -EFAULT; - return __bitmap_parselist((const char __force *)ubuf, - ulen, 1, maskp, nmaskbits); + char *buf; + int ret; + + buf = memdup_user_nul(ubuf, ulen); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + ret = bitmap_parselist(buf, maskp, nmaskbits); + + kfree(buf); + return ret; } EXPORT_SYMBOL(bitmap_parselist_user); +#ifdef CONFIG_NUMA /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap @@ -757,7 +790,6 @@ void bitmap_remap(unsigned long *dst, const unsigned long *src, set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst); } } -EXPORT_SYMBOL(bitmap_remap); /** * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit @@ -795,7 +827,6 @@ int bitmap_bitremap(int oldbit, const unsigned long *old, else return bitmap_ord_to_pos(new, n % w, bits); } -EXPORT_SYMBOL(bitmap_bitremap); /** * bitmap_onto - translate one bitmap relative to another @@ -930,7 +961,6 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig, m++; } } -EXPORT_SYMBOL(bitmap_onto); /** * bitmap_fold - fold larger bitmap into smaller, modulo specified size @@ -955,7 +985,7 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig, for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } -EXPORT_SYMBOL(bitmap_fold); +#endif /* CONFIG_NUMA */ /* * Common code for bitmap_*_region() routines. diff --git a/lib/list_sort.c b/lib/list_sort.c index 85759928215b..06e900c5587b 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -7,33 +7,41 @@ #include <linux/list_sort.h> #include <linux/list.h> -#define MAX_LIST_LENGTH_BITS 20 +typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *, + struct list_head const *, struct list_head const *); /* * Returns a list organized in an intermediate format suited * to chaining of merge() calls: null-terminated, no reserved or * sentinel head node, "prev" links not maintained. */ -static struct list_head *merge(void *priv, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b), +__attribute__((nonnull(2,3,4))) +static struct list_head *merge(void *priv, cmp_func cmp, struct list_head *a, struct list_head *b) { - struct list_head head, *tail = &head; + struct list_head *head, **tail = &head; - while (a && b) { + for (;;) { /* if equal, take 'a' -- important for sort stability */ - if ((*cmp)(priv, a, b) <= 0) { - tail->next = a; + if (cmp(priv, a, b) <= 0) { + *tail = a; + tail = &a->next; a = a->next; + if (!a) { + *tail = b; + break; + } } else { - tail->next = b; + *tail = b; + tail = &b->next; b = b->next; + if (!b) { + *tail = a; + break; + } } - tail = tail->next; } - tail->next = a?:b; - return head.next; + return head; } /* @@ -43,44 +51,52 @@ static struct list_head *merge(void *priv, * prev-link restoration pass, or maintaining the prev links * throughout. */ -static void merge_and_restore_back_links(void *priv, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b), - struct list_head *head, - struct list_head *a, struct list_head *b) +__attribute__((nonnull(2,3,4,5))) +static void merge_final(void *priv, cmp_func cmp, struct list_head *head, + struct list_head *a, struct list_head *b) { struct list_head *tail = head; u8 count = 0; - while (a && b) { + for (;;) { /* if equal, take 'a' -- important for sort stability */ - if ((*cmp)(priv, a, b) <= 0) { + if (cmp(priv, a, b) <= 0) { tail->next = a; a->prev = tail; + tail = a; a = a->next; + if (!a) + break; } else { tail->next = b; b->prev = tail; + tail = b; b = b->next; + if (!b) { + b = a; + break; + } } - tail = tail->next; } - tail->next = a ? : b; + /* Finish linking remainder of list b on to tail */ + tail->next = b; do { /* - * In worst cases this loop may run many iterations. + * If the merge is highly unbalanced (e.g. the input is + * already sorted), this loop may run many iterations. * Continue callbacks to the client even though no * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ - if (unlikely(!(++count))) - (*cmp)(priv, tail->next, tail->next); - - tail->next->prev = tail; - tail = tail->next; - } while (tail->next); - + if (unlikely(!++count)) + cmp(priv, b, b); + b->prev = tail; + tail = b; + b = b->next; + } while (b); + + /* And the final links to make a circular doubly-linked list */ tail->next = head; head->prev = tail; } @@ -91,55 +107,149 @@ static void merge_and_restore_back_links(void *priv, * @head: the list to sort * @cmp: the elements comparison function * - * This function implements "merge sort", which has O(nlog(n)) - * complexity. + * The comparison funtion @cmp must return > 0 if @a should sort after + * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should + * sort before @b *or* their original order should be preserved. It is + * always called with the element that came first in the input in @a, + * and list_sort is a stable sort, so it is not necessary to distinguish + * the @a < @b and @a == @b cases. + * + * This is compatible with two styles of @cmp function: + * - The traditional style which returns <0 / =0 / >0, or + * - Returning a boolean 0/1. + * The latter offers a chance to save a few cycles in the comparison + * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c). + * + * A good way to write a multi-word comparison is + * if (a->high != b->high) + * return a->high > b->high; + * if (a->middle != b->middle) + * return a->middle > b->middle; + * return a->low > b->low; + * + * + * This mergesort is as eager as possible while always performing at least + * 2:1 balanced merges. Given two pending sublists of size 2^k, they are + * merged to a size-2^(k+1) list as soon as we have 2^k following elements. + * + * Thus, it will avoid cache thrashing as long as 3*2^k elements can + * fit into the cache. Not quite as good as a fully-eager bottom-up + * mergesort, but it does use 0.2*n fewer comparisons, so is faster in + * the common case that everything fits into L1. + * + * + * The merging is controlled by "count", the number of elements in the + * pending lists. This is beautiully simple code, but rather subtle. * - * The comparison function @cmp must return a negative value if @a - * should sort before @b, and a positive value if @a should sort after - * @b. If @a and @b are equivalent, and their original relative - * ordering is to be preserved, @cmp must return 0. + * Each time we increment "count", we set one bit (bit k) and clear + * bits k-1 .. 0. Each time this happens (except the very first time + * for each bit, when count increments to 2^k), we merge two lists of + * size 2^k into one list of size 2^(k+1). + * + * This merge happens exactly when the count reaches an odd multiple of + * 2^k, which is when we have 2^k elements pending in smaller lists, + * so it's safe to merge away two lists of size 2^k. + * + * After this happens twice, we have created two lists of size 2^(k+1), + * which will be merged into a list of size 2^(k+2) before we create + * a third list of size 2^(k+1), so there are never more than two pending. + * + * The number of pending lists of size 2^k is determined by the + * state of bit k of "count" plus two extra pieces of information: + * - The state of bit k-1 (when k == 0, consider bit -1 always set), and + * - Whether the higher-order bits are zero or non-zero (i.e. + * is count >= 2^(k+1)). + * There are six states we distinguish. "x" represents some arbitrary + * bits, and "y" represents some arbitrary non-zero bits: + * 0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k + * 1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k + * 2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k + * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k + * 4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k + * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k + * (merge and loop back to state 2) + * + * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because + * bit k-1 is set while the more significant bits are non-zero) and + * merge them away in the 5->2 transition. Note in particular that just + * before the 5->2 transition, all lower-order bits are 11 (state 3), + * so there is one list of each smaller size. + * + * When we reach the end of the input, we merge all the pending + * lists, from smallest to largest. If you work through cases 2 to + * 5 above, you can see that the number of elements we merge with a list + * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to + * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1). */ +__attribute__((nonnull(2,3))) void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { - struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists - -- last slot is a sentinel */ - int lev; /* index into part[] */ - int max_lev = 0; - struct list_head *list; + struct list_head *list = head->next, *pending = NULL; + size_t count = 0; /* Count of pending */ - if (list_empty(head)) + if (list == head->prev) /* Zero or one elements */ return; - memset(part, 0, sizeof(part)); - + /* Convert to a null-terminated singly-linked list. */ head->prev->next = NULL; - list = head->next; - - while (list) { - struct list_head *cur = list; - list = list->next; - cur->next = NULL; - for (lev = 0; part[lev]; lev++) { - cur = merge(priv, cmp, part[lev], cur); - part[lev] = NULL; - } - if (lev > max_lev) { - if (unlikely(lev >= ARRAY_SIZE(part)-1)) { - printk_once(KERN_DEBUG "list too long for efficiency\n"); - lev--; - } - max_lev = lev; + /* + * Data structure invariants: + * - All lists are singly linked and null-terminated; prev + * pointers are not maintained. + * - pending is a prev-linked "list of lists" of sorted + * sublists awaiting further merging. + * - Each of the sorted sublists is power-of-two in size. + * - Sublists are sorted by size and age, smallest & newest at front. + * - There are zero to two sublists of each size. + * - A pair of pending sublists are merged as soon as the number + * of following pending elements equals their size (i.e. + * each time count reaches an odd multiple of that size). + * That ensures each later final merge will be at worst 2:1. + * - Each round consists of: + * - Merging the two sublists selected by the highest bit + * which flips when count is incremented, and + * - Adding an element from the input as a size-1 sublist. + */ + do { + size_t bits; + struct list_head **tail = &pending; + + /* Find the least-significant clear bit in count */ + for (bits = count; bits & 1; bits >>= 1) + tail = &(*tail)->prev; + /* Do the indicated merge */ + if (likely(bits)) { + struct list_head *a = *tail, *b = a->prev; + + a = merge(priv, (cmp_func)cmp, b, a); + /* Install the merged result in place of the inputs */ + a->prev = b->prev; + *tail = a; } - part[lev] = cur; - } - for (lev = 0; lev < max_lev; lev++) - if (part[lev]) - list = merge(priv, cmp, part[lev], list); - - merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); + /* Move one element from input list to pending */ + list->prev = pending; + pending = list; + list = list->next; + pending->next = NULL; + count++; + } while (list); + + /* End of input; merge together all the pending lists. */ + list = pending; + pending = pending->prev; + for (;;) { + struct list_head *next = pending->prev; + + if (!next) + break; + list = merge(priv, (cmp_func)cmp, pending, list); + pending = next; + } + /* The final merge, rebuilding prev links */ + merge_final(priv, (cmp_func)cmp, head, pending, list); } EXPORT_SYMBOL(list_sort); diff --git a/lib/math/Kconfig b/lib/math/Kconfig new file mode 100644 index 000000000000..73bdf37178d1 --- /dev/null +++ b/lib/math/Kconfig @@ -0,0 +1,11 @@ +config CORDIC + tristate "CORDIC algorithm" + help + This option provides an implementation of the CORDIC algorithm; + calculations are in fixed point. Module will be called cordic. + +config PRIME_NUMBERS + tristate + +config RATIONAL + bool diff --git a/lib/math/Makefile b/lib/math/Makefile new file mode 100644 index 000000000000..583bbfebfc09 --- /dev/null +++ b/lib/math/Makefile @@ -0,0 +1,5 @@ +obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o + +obj-$(CONFIG_CORDIC) += cordic.o +obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o +obj-$(CONFIG_RATIONAL) += rational.o diff --git a/lib/cordic.c b/lib/math/cordic.c index 8ef27c12956f..8ef27c12956f 100644 --- a/lib/cordic.c +++ b/lib/math/cordic.c diff --git a/lib/div64.c b/lib/math/div64.c index ee146bb4c558..368ca7fd0d82 100644 --- a/lib/div64.c +++ b/lib/math/div64.c @@ -10,7 +10,7 @@ * Generic C version of 64bit/32bit division and modulo, with * 64bit result and 32bit remainder. * - * The fast case for (n>>32 == 0) is handled inline by do_div(). + * The fast case for (n>>32 == 0) is handled inline by do_div(). * * Code generated for this function might be very inefficient * for some CPUs. __div64_32() can be overridden by linking arch-specific diff --git a/lib/gcd.c b/lib/math/gcd.c index 7948ab27f0a4..7948ab27f0a4 100644 --- a/lib/gcd.c +++ b/lib/math/gcd.c diff --git a/lib/math/int_pow.c b/lib/math/int_pow.c new file mode 100644 index 000000000000..622fc1ab3c74 --- /dev/null +++ b/lib/math/int_pow.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * An integer based power function + * + * Derived from drivers/video/backlight/pwm_bl.c + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/types.h> + +/** + * int_pow - computes the exponentiation of the given base and exponent + * @base: base which will be raised to the given power + * @exp: power to be raised to + * + * Computes: pow(base, exp), i.e. @base raised to the @exp power + */ +u64 int_pow(u64 base, unsigned int exp) +{ + u64 result = 1; + + while (exp) { + if (exp & 1) + result *= base; + exp >>= 1; + base *= base; + } + + return result; +} +EXPORT_SYMBOL_GPL(int_pow); diff --git a/lib/int_sqrt.c b/lib/math/int_sqrt.c index 30e0f9770f88..30e0f9770f88 100644 --- a/lib/int_sqrt.c +++ b/lib/math/int_sqrt.c diff --git a/lib/lcm.c b/lib/math/lcm.c index 03d7fcb420b5..03d7fcb420b5 100644 --- a/lib/lcm.c +++ b/lib/math/lcm.c diff --git a/lib/prime_numbers.c b/lib/math/prime_numbers.c index 550eec457c2e..550eec457c2e 100644 --- a/lib/prime_numbers.c +++ b/lib/math/prime_numbers.c diff --git a/lib/rational.c b/lib/math/rational.c index ba7443677c90..ba7443677c90 100644 --- a/lib/rational.c +++ b/lib/math/rational.c diff --git a/lib/reciprocal_div.c b/lib/math/reciprocal_div.c index bf043258fa00..bf043258fa00 100644 --- a/lib/reciprocal_div.c +++ b/lib/math/reciprocal_div.c diff --git a/lib/plist.c b/lib/plist.c index 199408f91057..d3bd8827186f 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -26,7 +26,7 @@ #include <linux/bug.h> #include <linux/plist.h> -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST static struct plist_head test_head; @@ -173,7 +173,7 @@ void plist_requeue(struct plist_node *node, struct plist_head *head) plist_check_head(head); } -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/module.h> diff --git a/lib/sort.c b/lib/sort.c index d6b7a202b0b6..50855ea8c262 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -1,8 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* - * A fast, small, non-recursive O(nlog n) sort for the Linux kernel + * A fast, small, non-recursive O(n log n) sort for the Linux kernel * - * Jan 23 2005 Matt Mackall <mpm@selenic.com> + * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, + * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. + * + * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n + * better) at the expense of stack usage and much larger code to avoid + * quicksort's O(n^2) worst case. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -11,35 +16,155 @@ #include <linux/export.h> #include <linux/sort.h> -static int alignment_ok(const void *base, int align) +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) { - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; } -static void u32_swap(void *a, void *b, int size) +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a, @b: pointers to the elements + * @size: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is stil valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) { - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); } -static void u64_swap(void *a, void *b, int size) +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a, @b: pointers to the elements + * @size: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) { - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); } -static void generic_swap(void *a, void *b, int size) +/** + * swap_bytes - swap two elements a byte at a time + * @a, @b: pointers to the elements + * @size: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) { - char t; - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +typedef void (*swap_func_t)(void *a, void *b, int size); + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_func_t)0 +#define SWAP_WORDS_32 (swap_func_t)1 +#define SWAP_BYTES (swap_func_t)2 + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func) +{ + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size); +} + +/** + * parent - given the offset of the child, find the offset of the parent. + * @i: the offset of the heap element whose parent is sought. Non-zero. + * @lsbit: a precomputed 1-bit mask, equal to "size & -size" + * @size: size of each element + * + * In terms of array indexes, the parent of element j = @i/@size is simply + * (j-1)/2. But when working in byte offsets, we can't use implicit + * truncation of integer divides. + * + * Fortunately, we only need one bit of the quotient, not the full divide. + * @size has a least significant bit. That bit will be clear if @i is + * an even multiple of @size, and set if it's an odd multiple. + * + * Logically, we're doing "if (i & lsbit) i -= size;", but since the + * branch is unpredictable, it's done with a bit of clever branch-free + * code instead. + */ +__attribute_const__ __always_inline +static size_t parent(size_t i, unsigned int lsbit, size_t size) +{ + i -= size; + i -= size & -(i & lsbit); + return i / 2; } /** @@ -50,57 +175,78 @@ static void generic_swap(void *a, void *b, int size) * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * - * This function does a heapsort on the given array. You may provide a - * swap_func function optimized to your element type. + * This function does a heapsort on the given array. You may provide + * a swap_func function if you need to do something more than a memory + * copy (e.g. fix up pointers or auxiliary data), but the built-in swap + * avoids a slow retpoline and so is significantly faster. * * Sorting time is O(n log n) both on average and worst-case. While - * qsort is about 20% faster on average, it suffers from exploitable + * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ - void sort(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *), void (*swap_func)(void *, void *, int size)) { /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; + size_t n = num * size, a = (num/2) * size; + const unsigned int lsbit = size & -size; /* Used to find parent */ + + if (!a) /* num < 2 || size == 0 */ + return; if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; else - swap_func = generic_swap; + swap_func = SWAP_BYTES; } - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size) < 0) - c += size; - if (cmp_func(base + r, base + c) >= 0) - break; - swap_func(base + r, base + c, size); - } - } + /* + * Loop invariants: + * 1. elements [a,n) satisfy the heap property (compare greater than + * all of their children), + * 2. elements [n,num*size) are sorted, and + * 3. a <= b <= c <= d <= n (whenever they are valid). + */ + for (;;) { + size_t b, c, d; + + if (a) /* Building heap: sift down --a */ + a -= size; + else if (n -= size) /* Sorting: Extract root to --n */ + do_swap(base, base + n, size, swap_func); + else /* Sort complete */ + break; - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size) < 0) - c += size; - if (cmp_func(base + r, base + c) >= 0) - break; - swap_func(base + r, base + c, size); + /* + * Sift element at "a" down into heap. This is the + * "bottom-up" variant, which significantly reduces + * calls to cmp_func(): we find the sift-down path all + * the way to the leaves (one compare per level), then + * backtrack to find where to insert the target element. + * + * Because elements tend to sift down close to the leaves, + * this uses fewer compares than doing two per level + * on the way down. (A bit more than half as many on + * average, 3/4 worst-case.) + */ + for (b = a; c = 2*b + size, (d = c + size) < n;) + b = cmp_func(base + c, base + d) >= 0 ? c : d; + if (d == n) /* Special case last leaf with no sibling */ + b = c; + + /* Now backtrack from "b" to the correct location for "a" */ + while (b != a && cmp_func(base + a, base + b) >= 0) + b = parent(b, lsbit, size); + c = b; /* Where "a" belongs */ + while (b != a) { /* Shift it into place */ + b = parent(b, lsbit, size); + do_swap(base + b, base + c, size, swap_func); } } } - EXPORT_SYMBOL(sort); diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 792d90608052..d3a501f2a81a 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -11,6 +11,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/uaccess.h> #include "../tools/testing/selftests/kselftest_module.h" @@ -226,7 +227,8 @@ static const unsigned long exp[] __initconst = { BITMAP_FROM_U64(0xffffffff), BITMAP_FROM_U64(0xfffffffe), BITMAP_FROM_U64(0x3333333311111111ULL), - BITMAP_FROM_U64(0xffffffff77777777ULL) + BITMAP_FROM_U64(0xffffffff77777777ULL), + BITMAP_FROM_U64(0), }; static const unsigned long exp2[] __initconst = { @@ -249,55 +251,93 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = { {0, "1-31:4/4", &exp[9 * step], 32, 0}, {0, "0-31:1/4,32-63:2/4", &exp[10 * step], 64, 0}, {0, "0-31:3/4,32-63:4/4", &exp[11 * step], 64, 0}, + {0, " ,, 0-31:3/4 ,, 32-63:4/4 ,, ", &exp[11 * step], 64, 0}, {0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4", exp2, 128, 0}, {0, "0-2047:128/256", NULL, 2048, PARSE_TIME}, + {0, "", &exp[12 * step], 8, 0}, + {0, "\n", &exp[12 * step], 8, 0}, + {0, ",, ,, , , ,", &exp[12 * step], 8, 0}, + {0, " , ,, , , ", &exp[12 * step], 8, 0}, + {0, " , ,, , , \n", &exp[12 * step], 8, 0}, + {-EINVAL, "-1", NULL, 8, 0}, {-EINVAL, "-0", NULL, 8, 0}, {-EINVAL, "10-1", NULL, 8, 0}, {-EINVAL, "0-31:", NULL, 8, 0}, {-EINVAL, "0-31:0", NULL, 8, 0}, + {-EINVAL, "0-31:0/", NULL, 8, 0}, {-EINVAL, "0-31:0/0", NULL, 8, 0}, {-EINVAL, "0-31:1/0", NULL, 8, 0}, {-EINVAL, "0-31:10/1", NULL, 8, 0}, + {-EOVERFLOW, "0-98765432123456789:10/1", NULL, 8, 0}, + + {-EINVAL, "a-31", NULL, 8, 0}, + {-EINVAL, "0-a1", NULL, 8, 0}, + {-EINVAL, "a-31:10/1", NULL, 8, 0}, + {-EINVAL, "0-31:a/1", NULL, 8, 0}, + {-EINVAL, "0-\n", NULL, 8, 0}, }; -static void __init test_bitmap_parselist(void) +static void __init __test_bitmap_parselist(int is_user) { int i; int err; - cycles_t cycles; + ktime_t time; DECLARE_BITMAP(bmap, 2048); + char *mode = is_user ? "_user" : ""; for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) { #define ptest parselist_tests[i] - cycles = get_cycles(); - err = bitmap_parselist(ptest.in, bmap, ptest.nbits); - cycles = get_cycles() - cycles; + if (is_user) { + mm_segment_t orig_fs = get_fs(); + size_t len = strlen(ptest.in); + + set_fs(KERNEL_DS); + time = ktime_get(); + err = bitmap_parselist_user(ptest.in, len, + bmap, ptest.nbits); + time = ktime_get() - time; + set_fs(orig_fs); + } else { + time = ktime_get(); + err = bitmap_parselist(ptest.in, bmap, ptest.nbits); + time = ktime_get() - time; + } if (err != ptest.errno) { - pr_err("test %d: input is %s, errno is %d, expected %d\n", - i, ptest.in, err, ptest.errno); + pr_err("parselist%s: %d: input is %s, errno is %d, expected %d\n", + mode, i, ptest.in, err, ptest.errno); continue; } if (!err && ptest.expected && !__bitmap_equal(bmap, ptest.expected, ptest.nbits)) { - pr_err("test %d: input is %s, result is 0x%lx, expected 0x%lx\n", - i, ptest.in, bmap[0], *ptest.expected); + pr_err("parselist%s: %d: input is %s, result is 0x%lx, expected 0x%lx\n", + mode, i, ptest.in, bmap[0], + *ptest.expected); continue; } if (ptest.flags & PARSE_TIME) - pr_err("test %d: input is '%s' OK, Time: %llu\n", - i, ptest.in, - (unsigned long long)cycles); + pr_err("parselist%s: %d: input is '%s' OK, Time: %llu\n", + mode, i, ptest.in, time); } } +static void __init test_bitmap_parselist(void) +{ + __test_bitmap_parselist(0); +} + +static void __init test_bitmap_parselist_user(void) +{ + __test_bitmap_parselist(1); +} + #define EXP_BYTES (sizeof(exp) * 8) static void __init test_bitmap_arr32(void) @@ -370,6 +410,7 @@ static void __init selftest(void) test_copy(); test_bitmap_arr32(); test_bitmap_parselist(); + test_bitmap_parselist_user(); test_mem_optimisations(); } diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 3dd801c1c85b..566dad3f4196 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -47,6 +47,9 @@ struct test_sysctl_data { unsigned int uint_0001; char string_0001[65]; + +#define SYSCTL_TEST_BITMAP_SIZE 65536 + unsigned long *bitmap_0001; }; static struct test_sysctl_data test_data = { @@ -102,6 +105,13 @@ static struct ctl_table test_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, + { + .procname = "bitmap_0001", + .data = &test_data.bitmap_0001, + .maxlen = SYSCTL_TEST_BITMAP_SIZE, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { } }; @@ -129,15 +139,21 @@ static struct ctl_table_header *test_sysctl_header; static int __init test_sysctl_init(void) { + test_data.bitmap_0001 = kzalloc(SYSCTL_TEST_BITMAP_SIZE/8, GFP_KERNEL); + if (!test_data.bitmap_0001) + return -ENOMEM; test_sysctl_header = register_sysctl_table(test_sysctl_root_table); - if (!test_sysctl_header) + if (!test_sysctl_header) { + kfree(test_data.bitmap_0001); return -ENOMEM; + } return 0; } late_initcall(test_sysctl_init); static void __exit test_sysctl_exit(void) { + kfree(test_data.bitmap_0001); if (test_sysctl_header) unregister_sysctl_table(test_sysctl_header); } diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index f832b095afba..8bbefcaddfe8 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -384,12 +384,11 @@ static int test_func(void *private) { struct test_driver *t = private; int random_array[ARRAY_SIZE(test_case_array)]; - int index, i, j, ret; + int index, i, j; ktime_t kt; u64 delta; - ret = set_cpus_allowed_ptr(current, cpumask_of(t->cpu)); - if (ret < 0) + if (set_cpus_allowed_ptr(current, cpumask_of(t->cpu)) < 0) pr_err("Failed to set affinity to %d CPU\n", t->cpu); for (i = 0; i < ARRAY_SIZE(test_case_array); i++) @@ -415,8 +414,7 @@ static int test_func(void *private) kt = ktime_get(); for (j = 0; j < test_repeat_count; j++) { - ret = test_case_array[index].test_func(); - if (!ret) + if (!test_case_array[index].test_func()) per_cpu_test_data[t->cpu][index].test_passed++; else per_cpu_test_data[t->cpu][index].test_failed++; diff --git a/mm/Makefile b/mm/Makefile index d210cc9d6f80..ac5e5ba78874 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o \ + maccess.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ @@ -41,6 +41,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) +# Give 'page_alloc' its own module-parameter namespace +page-alloc-y := page_alloc.o +page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o + +obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o diff --git a/mm/compaction.c b/mm/compaction.c index 6cc4bea33dcb..cbac7277978a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1888,13 +1888,13 @@ static enum compact_result __compact_finished(struct compact_control *cc) bool can_steal; /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[migratetype])) + if (!free_area_empty(area, migratetype)) return COMPACT_SUCCESS; #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && - !list_empty(&area->free_list[MIGRATE_CMA])) + !free_area_empty(area, MIGRATE_CMA)) return COMPACT_SUCCESS; #endif /* diff --git a/mm/debug.c b/mm/debug.c index eee9c221280c..8345bb6e4769 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,7 +67,7 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx", + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 287933005e11..e50a2db5b4ff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -687,10 +687,119 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -static unsigned long memcg_sum_events(struct mem_cgroup *memcg, - int event) +/** + * __mod_memcg_state - update cgroup memory statistics + * @memcg: the memory cgroup + * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item + * @val: delta to add to the counter, can be negative + */ +void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) +{ + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + struct mem_cgroup *mi; + + atomic_long_add(x, &memcg->vmstats_local[idx]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &mi->vmstats[idx]); + x = 0; + } + __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); +} + +static struct mem_cgroup_per_node * +parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) +{ + struct mem_cgroup *parent; + + parent = parent_mem_cgroup(pn->memcg); + if (!parent) + return NULL; + return mem_cgroup_nodeinfo(parent, nid); +} + +/** + * __mod_lruvec_state - update lruvec memory statistics + * @lruvec: the lruvec + * @idx: the stat item + * @val: delta to add to the counter, can be negative + * + * The lruvec is the intersection of the NUMA node and a cgroup. This + * function updates the all three counters that are affected by a + * change of state at this level: per-node, per-cgroup, per-lruvec. + */ +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val) { - return atomic_long_read(&memcg->events[event]); + pg_data_t *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; + long x; + + /* Update node */ + __mod_node_page_state(pgdat, idx, val); + + if (mem_cgroup_disabled()) + return; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + + /* Update memcg */ + __mod_memcg_state(memcg, idx, val); + + /* Update lruvec */ + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + struct mem_cgroup_per_node *pi; + + atomic_long_add(x, &pn->lruvec_stat_local[idx]); + for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) + atomic_long_add(x, &pi->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); +} + +/** + * __count_memcg_events - account VM events in a cgroup + * @memcg: the memory cgroup + * @idx: the event item + * @count: the number of events that occured + */ +void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count) +{ + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + struct mem_cgroup *mi; + + atomic_long_add(x, &memcg->vmevents_local[idx]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &mi->vmevents[idx]); + x = 0; + } + __this_cpu_write(memcg->vmstats_percpu->events[idx], x); +} + +static unsigned long memcg_events(struct mem_cgroup *memcg, int event) +{ + return atomic_long_read(&memcg->vmevents[event]); +} + +static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) +{ + return atomic_long_read(&memcg->vmevents_local[event]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -722,7 +831,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); + __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -730,8 +839,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat_cpu->nr_page_events); - next = __this_cpu_read(memcg->stat_cpu->targets[target]); + val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); + next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { @@ -747,7 +856,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat_cpu->targets[target], next); + __this_cpu_write(memcg->vmstats_percpu->targets[target], next); return true; } return false; @@ -1325,12 +1434,14 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; pr_cont(" %s:%luKB", memcg1_stat_names[i], - K(memcg_page_state(iter, memcg1_stats[i]))); + K(memcg_page_state_local(iter, + memcg1_stats[i]))); } for (i = 0; i < NR_LRU_LISTS; i++) pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(memcg_page_state(iter, NR_LRU_BASE + i))); + K(memcg_page_state_local(iter, + NR_LRU_BASE + i))); pr_cont("\n"); } @@ -2076,7 +2187,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *mi; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); @@ -2088,9 +2199,12 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) int nid; long x; - x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); - if (x) - atomic_long_add(x, &memcg->stat[i]); + x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); + if (x) { + atomic_long_add(x, &memcg->vmstats_local[i]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &memcg->vmstats[i]); + } if (i >= NR_VM_NODE_STAT_ITEMS) continue; @@ -2100,17 +2214,24 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) pn = mem_cgroup_nodeinfo(memcg, nid); x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); - if (x) - atomic_long_add(x, &pn->lruvec_stat[i]); + if (x) { + atomic_long_add(x, &pn->lruvec_stat_local[i]); + do { + atomic_long_add(x, &pn->lruvec_stat[i]); + } while ((pn = parent_nodeinfo(pn, nid))); + } } } for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x; - x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); - if (x) - atomic_long_add(x, &memcg->events[i]); + x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); + if (x) { + atomic_long_add(x, &memcg->vmevents_local[i]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &memcg->vmevents[i]); + } } } @@ -2940,50 +3061,15 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -struct accumulated_stats { - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - unsigned long lru_pages[NR_LRU_LISTS]; - const unsigned int *stats_array; - const unsigned int *events_array; - int stats_size; - int events_size; -}; - -static void accumulate_memcg_tree(struct mem_cgroup *memcg, - struct accumulated_stats *acc) -{ - struct mem_cgroup *mi; - int i; - - for_each_mem_cgroup_tree(mi, memcg) { - for (i = 0; i < acc->stats_size; i++) - acc->stat[i] += memcg_page_state(mi, - acc->stats_array ? acc->stats_array[i] : i); - - for (i = 0; i < acc->events_size; i++) - acc->events[i] += memcg_sum_events(mi, - acc->events_array ? acc->events_array[i] : i); - - for (i = 0; i < NR_LRU_LISTS; i++) - acc->lru_pages[i] += memcg_page_state(mi, - NR_LRU_BASE + i); - } -} - static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val = 0; + unsigned long val; if (mem_cgroup_is_root(memcg)) { - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) { - val += memcg_page_state(iter, MEMCG_CACHE); - val += memcg_page_state(iter, MEMCG_RSS); - if (swap) - val += memcg_page_state(iter, MEMCG_SWAP); - } + val = memcg_page_state(memcg, MEMCG_CACHE) + + memcg_page_state(memcg, MEMCG_RSS); + if (swap) + val += memcg_page_state(memcg, MEMCG_SWAP); } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -3324,7 +3410,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); } return nr; } @@ -3338,7 +3424,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); } return nr; } @@ -3414,7 +3500,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; - struct accumulated_stats acc; BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3423,17 +3508,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state(memcg, memcg1_stats[i]) * + memcg_page_state_local(memcg, memcg1_stats[i]) * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "%s %lu\n", memcg1_event_names[i], - memcg_sum_events(memcg, memcg1_events[i])); + memcg_events_local(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], - memcg_page_state(memcg, NR_LRU_BASE + i) * + memcg_page_state_local(memcg, NR_LRU_BASE + i) * PAGE_SIZE); /* Hierarchical information */ @@ -3448,27 +3533,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - memset(&acc, 0, sizeof(acc)); - acc.stats_size = ARRAY_SIZE(memcg1_stats); - acc.stats_array = memcg1_stats; - acc.events_size = ARRAY_SIZE(memcg1_events); - acc.events_array = memcg1_events; - accumulate_memcg_tree(memcg, &acc); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)acc.stat[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, i) * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], - (u64)acc.events[i]); + (u64)memcg_events(memcg, i)); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], - (u64)acc.lru_pages[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -3901,11 +3980,11 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) */ static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->stat[idx]); + long x = atomic_long_read(&memcg->vmstats[idx]); int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx]; + x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; if (x < 0) x = 0; return x; @@ -4445,7 +4524,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->stat_cpu); + free_percpu(memcg->vmstats_percpu); kfree(memcg); } @@ -4474,8 +4553,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail; - memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat_cpu) + memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu); + if (!memcg->vmstats_percpu) goto fail; for_each_node(node) @@ -5561,7 +5640,6 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - struct accumulated_stats acc; int i; /* @@ -5575,31 +5653,27 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ - memset(&acc, 0, sizeof(acc)); - acc.stats_size = MEMCG_NR_STAT; - acc.events_size = NR_VM_EVENT_ITEMS; - accumulate_memcg_tree(memcg, &acc); - seq_printf(m, "anon %llu\n", - (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024); seq_printf(m, "slab %llu\n", - (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + - acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * + PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); /* * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter @@ -5608,43 +5682,47 @@ static int memory_stat_show(struct seq_file *m, void *v) * where the page->mem_cgroup is set up and stable. */ seq_printf(m, "anon_thp %llu\n", - (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], - (u64)acc.lru_pages[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); seq_printf(m, "slab_reclaimable %llu\n", - (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * + PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * + PAGE_SIZE); /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); + seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); + memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); + memcg_page_state(memcg, WORKINGSET_ACTIVATE)); seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - - seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + - acc.events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + - acc.events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); + memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); + + seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); + seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + + memcg_events(memcg, PGSCAN_DIRECT)); + seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + + memcg_events(memcg, PGSTEAL_DIRECT)); + seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); + seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); + seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); + seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); + seq_printf(m, "thp_fault_alloc %lu\n", + memcg_events(memcg, THP_FAULT_ALLOC)); seq_printf(m, "thp_collapse_alloc %lu\n", - acc.events[THP_COLLAPSE_ALLOC]); + memcg_events(memcg, THP_COLLAPSE_ALLOC)); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return 0; @@ -6080,7 +6158,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6c0c4f48638e..328878b6799d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -39,6 +39,7 @@ #include <asm/tlbflush.h> #include "internal.h" +#include "shuffle.h" /* * online_page_callback contains pointer to current page onlining function. @@ -891,6 +892,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone->zone_pgdat->node_present_pages += onlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); + shuffle_zone(zone); + if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) diff --git a/mm/mincore.c b/mm/mincore.c index 218099b5ed31..c3f058bd0faf 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -169,6 +169,22 @@ out: return 0; } +static inline bool can_do_mincore(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * Reveal pagecache information only for non-anonymous mappings that + * correspond to the files the calling process could (if tried) open + * for writing; otherwise we'd be including shared non-exclusive + * mappings, which opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + if (!can_do_mincore(vma)) { + unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); + memset(vec, 1, pages); + return pages; + } + mincore_walk.mm = vma->vm_mm; err = walk_page_range(addr, end, &mincore_walk); if (err < 0) return err; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2f3fb4921d1..3b13d3914176 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include <linux/mempolicy.h> #include <linux/memremap.h> #include <linux/stop_machine.h> +#include <linux/random.h> #include <linux/sort.h> #include <linux/pfn.h> #include <linux/backing-dev.h> @@ -72,6 +73,7 @@ #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +#include "shuffle.h" /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -755,12 +757,6 @@ static inline void set_page_order(struct page *page, unsigned int order) __SetPageBuddy(page); } -static inline void rmv_page_order(struct page *page) -{ - __ClearPageBuddy(page); - set_page_private(page, 0); -} - /* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if @@ -918,13 +914,10 @@ continue_merging: * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. */ - if (page_is_guard(buddy)) { + if (page_is_guard(buddy)) clear_page_guard(zone, buddy, order, migratetype); - } else { - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); - } + else + del_page_from_free_area(buddy, &zone->free_area[order]); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -966,7 +959,8 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) + && !is_shuffle_order(order)) { struct page *higher_page, *higher_buddy; combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); @@ -974,15 +968,18 @@ done_merging: higher_buddy = higher_page + (buddy_pfn - combined_pfn); if (pfn_valid_within(buddy_pfn) && page_is_buddy(higher_page, higher_buddy, order + 1)) { - list_add_tail(&page->lru, - &zone->free_area[order].free_list[migratetype]); - goto out; + add_to_free_area_tail(page, &zone->free_area[order], + migratetype); + return; } } - list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); -out: - zone->free_area[order].nr_free++; + if (is_shuffle_order(order)) + add_to_free_area_random(page, &zone->free_area[order], + migratetype); + else + add_to_free_area(page, &zone->free_area[order], migratetype); + } /* @@ -1874,9 +1871,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order) void __init page_alloc_init_late(void) { struct zone *zone; + int nid; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - int nid; /* There will be num_node_state(N_MEMORY) threads */ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); @@ -1900,6 +1897,9 @@ void __init page_alloc_init_late(void) /* Discard memblock private memory */ memblock_discard(); + for_each_node_state(nid, N_MEMORY) + shuffle_free_memory(NODE_DATA(nid)); + for_each_populated_zone(zone) set_zone_contiguous(zone); } @@ -1970,8 +1970,7 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high, migratetype)) continue; - list_add(&page[size].lru, &area->free_list[migratetype]); - area->nr_free++; + add_to_free_area(&page[size], area, migratetype); set_page_order(&page[size], high); } } @@ -1986,7 +1985,7 @@ static void check_new_page_bad(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; bad_flags = __PG_HWPOISON; @@ -2113,13 +2112,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); - page = list_first_entry_or_null(&area->free_list[migratetype], - struct page, lru); + page = get_page_from_free_area(area, migratetype); if (!page) continue; - list_del(&page->lru); - rmv_page_order(page); - area->nr_free--; + del_page_from_free_area(page, area); expand(zone, page, order, current_order, area, migratetype); set_pcppage_migratetype(page, migratetype); return page; @@ -2205,8 +2201,7 @@ static int move_freepages(struct zone *zone, } order = page_order(page); - list_move(&page->lru, - &zone->free_area[order].free_list[migratetype]); + move_to_free_area(page, &zone->free_area[order], migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -2394,7 +2389,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, single_page: area = &zone->free_area[current_order]; - list_move(&page->lru, &area->free_list[start_type]); + move_to_free_area(page, area, start_type); } /* @@ -2418,7 +2413,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, if (fallback_mt == MIGRATE_TYPES) break; - if (list_empty(&area->free_list[fallback_mt])) + if (free_area_empty(area, fallback_mt)) continue; if (can_steal_fallback(order, migratetype)) @@ -2505,9 +2500,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, for (order = 0; order < MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); - page = list_first_entry_or_null( - &area->free_list[MIGRATE_HIGHATOMIC], - struct page, lru); + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; @@ -2630,8 +2623,7 @@ find_smallest: VM_BUG_ON(current_order == MAX_ORDER); do_steal: - page = list_first_entry(&area->free_list[fallback_mt], - struct page, lru); + page = get_page_from_free_area(area, fallback_mt); steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, can_steal); @@ -3068,6 +3060,7 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { + struct free_area *area = &page_zone(page)->free_area[order]; unsigned long watermark; struct zone *zone; int mt; @@ -3092,9 +3085,8 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* Remove page from free list */ - list_del(&page->lru); - zone->free_area[order].nr_free--; - rmv_page_order(page); + + del_page_from_free_area(page, area); /* * Set the pageblock if the isolated page is at least half of a @@ -3391,13 +3383,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, continue; for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { - if (!list_empty(&area->free_list[mt])) + if (!free_area_empty(area, mt)) return true; } #ifdef CONFIG_CMA if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + !free_area_empty(area, MIGRATE_CMA)) { return true; } #endif @@ -5324,7 +5316,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { - if (!list_empty(&area->free_list[type])) + if (!free_area_empty(area, type)) types[order] |= 1 << type; } } @@ -8497,9 +8489,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) pr_info("remove from free list %lx %d %lx\n", pfn, 1 << order, end_pfn); #endif - list_del(&page->lru); - rmv_page_order(page); - zone->free_area[order].nr_free--; + del_page_from_free_area(page, &zone->free_area[order]); for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); diff --git a/mm/shuffle.c b/mm/shuffle.c new file mode 100644 index 000000000000..3ce12481b1dc --- /dev/null +++ b/mm/shuffle.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/mmzone.h> +#include <linux/random.h> +#include <linux/moduleparam.h> +#include "internal.h" +#include "shuffle.h" + +DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +static unsigned long shuffle_state __ro_after_init; + +/* + * Depending on the architecture, module parameter parsing may run + * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents, + * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE + * attempts to turn on the implementation, but aborts if it finds + * SHUFFLE_FORCE_DISABLE already set. + */ +__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ + if (ctl == SHUFFLE_FORCE_DISABLE) + set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state); + + if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) { + if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_disable(&page_alloc_shuffle_key); + } else if (ctl == SHUFFLE_ENABLE + && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_enable(&page_alloc_shuffle_key); +} + +static bool shuffle_param; +extern int shuffle_show(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state) + ? 'Y' : 'N'); +} + +static __meminit int shuffle_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + if (shuffle_param) + page_alloc_shuffle(SHUFFLE_ENABLE); + else + page_alloc_shuffle(SHUFFLE_FORCE_DISABLE); + return 0; +} +module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +/* + * For two pages to be swapped in the shuffle, they must be free (on a + * 'free_area' lru), have the same order, and have the same migratetype. + */ +static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order) +{ + struct page *page; + + /* + * Given we're dealing with randomly selected pfns in a zone we + * need to ask questions like... + */ + + /* ...is the pfn even in the memmap? */ + if (!pfn_valid_within(pfn)) + return NULL; + + /* ...is the pfn in a present section or a hole? */ + if (!pfn_present(pfn)) + return NULL; + + /* ...is the page free and currently on a free_area list? */ + page = pfn_to_page(pfn); + if (!PageBuddy(page)) + return NULL; + + /* + * ...is the page on the same list as the page we will + * shuffle it with? + */ + if (page_order(page) != order) + return NULL; + + return page; +} + +/* + * Fisher-Yates shuffle the freelist which prescribes iterating through an + * array, pfns in this case, and randomly swapping each entry with another in + * the span, end_pfn - start_pfn. + * + * To keep the implementation simple it does not attempt to correct for sources + * of bias in the distribution, like modulo bias or pseudo-random number + * generator bias. I.e. the expectation is that this shuffling raises the bar + * for attacks that exploit the predictability of page allocations, but need not + * be a perfect shuffle. + */ +#define SHUFFLE_RETRY 10 +void __meminit __shuffle_zone(struct zone *z) +{ + unsigned long i, flags; + unsigned long start_pfn = z->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(z); + const int order = SHUFFLE_ORDER; + const int order_pages = 1 << order; + + spin_lock_irqsave(&z->lock, flags); + start_pfn = ALIGN(start_pfn, order_pages); + for (i = start_pfn; i < end_pfn; i += order_pages) { + unsigned long j; + int migratetype, retry; + struct page *page_i, *page_j; + + /* + * We expect page_i, in the sub-range of a zone being added + * (@start_pfn to @end_pfn), to more likely be valid compared to + * page_j randomly selected in the span @zone_start_pfn to + * @spanned_pages. + */ + page_i = shuffle_valid_page(i, order); + if (!page_i) + continue; + + for (retry = 0; retry < SHUFFLE_RETRY; retry++) { + /* + * Pick a random order aligned page in the zone span as + * a swap target. If the selected pfn is a hole, retry + * up to SHUFFLE_RETRY attempts find a random valid pfn + * in the zone. + */ + j = z->zone_start_pfn + + ALIGN_DOWN(get_random_long() % z->spanned_pages, + order_pages); + page_j = shuffle_valid_page(j, order); + if (page_j && page_j != page_i) + break; + } + if (retry >= SHUFFLE_RETRY) { + pr_debug("%s: failed to swap %#lx\n", __func__, i); + continue; + } + + /* + * Each migratetype corresponds to its own list, make sure the + * types match otherwise we're moving pages to lists where they + * do not belong. + */ + migratetype = get_pageblock_migratetype(page_i); + if (get_pageblock_migratetype(page_j) != migratetype) { + pr_debug("%s: migratetype mismatch %#lx\n", __func__, i); + continue; + } + + list_swap(&page_i->lru, &page_j->lru); + + pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j); + + /* take it easy on the zone lock */ + if ((i % (100 * order_pages)) == 0) { + spin_unlock_irqrestore(&z->lock, flags); + cond_resched(); + spin_lock_irqsave(&z->lock, flags); + } + } + spin_unlock_irqrestore(&z->lock, flags); +} + +/** + * shuffle_free_memory - reduce the predictability of the page allocator + * @pgdat: node page data + */ +void __meminit __shuffle_free_memory(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + shuffle_zone(z); +} + +void add_to_free_area_random(struct page *page, struct free_area *area, + int migratetype) +{ + static u64 rand; + static u8 rand_bits; + + /* + * The lack of locking is deliberate. If 2 threads race to + * update the rand state it just adds to the entropy. + */ + if (rand_bits == 0) { + rand_bits = 64; + rand = get_random_u64(); + } + + if (rand & 1) + add_to_free_area(page, area, migratetype); + else + add_to_free_area_tail(page, area, migratetype); + rand_bits--; + rand >>= 1; +} diff --git a/mm/shuffle.h b/mm/shuffle.h new file mode 100644 index 000000000000..777a257a0d2f --- /dev/null +++ b/mm/shuffle.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. +#ifndef _MM_SHUFFLE_H +#define _MM_SHUFFLE_H +#include <linux/jump_label.h> + +/* + * SHUFFLE_ENABLE is called from the command line enabling path, or by + * platform-firmware enabling that indicates the presence of a + * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from + * the command line path and overrides any previous or future + * SHUFFLE_ENABLE. + */ +enum mm_shuffle_ctl { + SHUFFLE_ENABLE, + SHUFFLE_FORCE_DISABLE, +}; + +#define SHUFFLE_ORDER (MAX_ORDER-1) + +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl); +extern void __shuffle_free_memory(pg_data_t *pgdat); +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_free_memory(pgdat); +} + +extern void __shuffle_zone(struct zone *z); +static inline void shuffle_zone(struct zone *z) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_zone(z); +} + +static inline bool is_shuffle_order(int order) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return false; + return order >= SHUFFLE_ORDER; +} +#else +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ +} + +static inline void shuffle_zone(struct zone *z) +{ +} + +static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ +} + +static inline bool is_shuffle_order(int order) +{ + return false; +} +#endif +#endif /* _MM_SHUFFLE_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e5e9e1fcac01..67bbb8d2a0a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -633,7 +633,7 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual criticial section protected @@ -651,7 +651,7 @@ static void purge_fragmented_blocks_allcpus(void); */ void set_iounmap_nonlazy(void) { - atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); + atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); } /* @@ -659,34 +659,40 @@ void set_iounmap_nonlazy(void) */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { + unsigned long resched_threshold; struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - bool do_free = false; lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); + if (unlikely(valist == NULL)) + return false; + + /* + * TODO: to calculate a flush range without looping. + * The list can be up to lazy_max_pages() elements. + */ llist_for_each_entry(va, valist, purge_list) { if (va->va_start < start) start = va->va_start; if (va->va_end > end) end = va->va_end; - do_free = true; } - if (!do_free) - return false; - flush_tlb_kernel_range(start, end); + resched_threshold = lazy_max_pages() << 1; spin_lock(&vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { - int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; __free_vmap_area(va); - atomic_sub(nr, &vmap_lazy_nr); - cond_resched_lock(&vmap_area_lock); + atomic_long_sub(nr, &vmap_lazy_nr); + + if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) + cond_resched_lock(&vmap_area_lock); } spin_unlock(&vmap_area_lock); return true; @@ -722,10 +728,10 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - int nr_lazy; + unsigned long nr_lazy; - nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, - &vmap_lazy_nr); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> + PAGE_SHIFT, &vmap_lazy_nr); /* After this point, we may free va at any time */ llist_add(&va->purge_list, &vmap_purge_list); diff --git a/mm/vmscan.c b/mm/vmscan.c index d96c54703948..7acd0afdfc2a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone int zid; if (!mem_cgroup_disabled()) - lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru); + lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); @@ -2150,7 +2150,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, * is being established. Disable active list protection to get * rid of the stale workingset quickly. */ - refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); + refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); if (file && actual_reclaim && lruvec->refaults != refaults) { inactive_ratio = 0; } else { @@ -2912,7 +2912,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) struct lruvec *lruvec; lruvec = mem_cgroup_lruvec(pgdat, memcg); - refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); + refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); lruvec->refaults = refaults; } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); } diff --git a/mm/workingset.c b/mm/workingset.c index 6419baebd306..e0b4edcb88c8 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -430,9 +430,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) - pages += lruvec_page_state(lruvec, NR_LRU_BASE + i); - pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); - pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + pages += lruvec_page_state_local(lruvec, + NR_LRU_BASE + i); + pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE); } else #endif pages = node_present_pages(sc->nid); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 71f06900473e..b96fd3f54705 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -163,7 +163,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, static void hooks_validate(const struct nf_hook_entries *hooks) { -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC struct nf_hook_ops **orig_ops; int prio = INT_MIN; size_t i = 0; diff --git a/scripts/gdb/linux/clk.py b/scripts/gdb/linux/clk.py new file mode 100644 index 000000000000..061aecfa294e --- /dev/null +++ b/scripts/gdb/linux/clk.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) NXP 2019 + +import gdb +import sys + +from linux import utils, lists, constants + +clk_core_type = utils.CachedType("struct clk_core") + + +def clk_core_for_each_child(hlist_head): + return lists.hlist_for_each_entry(hlist_head, + clk_core_type.get_type().pointer(), "child_node") + + +class LxClkSummary(gdb.Command): + """Print clk tree summary + +Output is a subset of /sys/kernel/debug/clk/clk_summary + +No calls are made during printing, instead a (c) if printed after values which +are cached and potentially out of date""" + + def __init__(self): + super(LxClkSummary, self).__init__("lx-clk-summary", gdb.COMMAND_DATA) + + def show_subtree(self, clk, level): + gdb.write("%*s%-*s %7d %8d %8d %11lu%s\n" % ( + level * 3 + 1, "", + 30 - level * 3, + clk['name'].string(), + clk['enable_count'], + clk['prepare_count'], + clk['protect_count'], + clk['rate'], + '(c)' if clk['flags'] & constants.LX_CLK_GET_RATE_NOCACHE else ' ')) + + for child in clk_core_for_each_child(clk['children']): + self.show_subtree(child, level + 1) + + def invoke(self, arg, from_tty): + gdb.write(" enable prepare protect \n") + gdb.write(" clock count count count rate \n") + gdb.write("------------------------------------------------------------------------\n") + for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_root_list")): + self.show_subtree(clk, 0) + for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_orphan_list")): + self.show_subtree(clk, 0) + + +LxClkSummary() + + +class LxClkCoreLookup(gdb.Function): + """Find struct clk_core by name""" + + def __init__(self): + super(LxClkCoreLookup, self).__init__("lx_clk_core_lookup") + + def lookup_hlist(self, hlist_head, name): + for child in clk_core_for_each_child(hlist_head): + if child['name'].string() == name: + return child + result = self.lookup_hlist(child['children'], name) + if result: + return result + + def invoke(self, name): + name = name.string() + return (self.lookup_hlist(gdb.parse_and_eval("clk_root_list"), name) or + self.lookup_hlist(gdb.parse_and_eval("clk_orphan_list"), name)) + + +LxClkCoreLookup() diff --git a/scripts/gdb/linux/config.py b/scripts/gdb/linux/config.py new file mode 100644 index 000000000000..90e1565b1967 --- /dev/null +++ b/scripts/gdb/linux/config.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2019 Google LLC. + +import gdb +import zlib + +from linux import utils + + +class LxConfigDump(gdb.Command): + """Output kernel config to the filename specified as the command + argument. Equivalent to 'zcat /proc/config.gz > config.txt' on + a running target""" + + def __init__(self): + super(LxConfigDump, self).__init__("lx-configdump", gdb.COMMAND_DATA, + gdb.COMPLETE_FILENAME) + + def invoke(self, arg, from_tty): + if len(arg) == 0: + filename = "config.txt" + else: + filename = arg + + try: + py_config_ptr = gdb.parse_and_eval("kernel_config_data + 8") + py_config_size = gdb.parse_and_eval( + "sizeof(kernel_config_data) - 1 - 8 * 2") + except gdb.error as e: + raise gdb.GdbError("Can't find config, enable CONFIG_IKCONFIG?") + + inf = gdb.inferiors()[0] + zconfig_buf = utils.read_memoryview(inf, py_config_ptr, + py_config_size).tobytes() + + config_buf = zlib.decompress(zconfig_buf, 16) + with open(filename, 'wb') as f: + f.write(config_buf) + + gdb.write("Dumped config to " + filename + "\n") + + +LxConfigDump() diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index d3319a80788a..1d73083da6cb 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -12,9 +12,12 @@ * */ +#include <linux/clk-provider.h> #include <linux/fs.h> +#include <linux/hrtimer.h> #include <linux/mount.h> #include <linux/of_fdt.h> +#include <linux/threads.h> /* We need to stringify expanded macros so that they can be parsed */ @@ -36,6 +39,9 @@ import gdb +/* linux/clk-provider.h */ +LX_GDBPARSED(CLK_GET_RATE_NOCACHE) + /* linux/fs.h */ LX_VALUE(SB_RDONLY) LX_VALUE(SB_SYNCHRONOUS) @@ -44,6 +50,9 @@ LX_VALUE(SB_DIRSYNC) LX_VALUE(SB_NOATIME) LX_VALUE(SB_NODIRATIME) +/* linux/htimer.h */ +LX_GDBPARSED(hrtimer_resolution) + /* linux/mount.h */ LX_VALUE(MNT_NOSUID) LX_VALUE(MNT_NODEV) @@ -52,8 +61,16 @@ LX_VALUE(MNT_NOATIME) LX_VALUE(MNT_NODIRATIME) LX_VALUE(MNT_RELATIME) +/* linux/threads.h */ +LX_VALUE(NR_CPUS) + /* linux/of_fdt.h> */ LX_VALUE(OF_DT_HEADER) /* Kernel Configs */ +LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS) +LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) +LX_CONFIG(CONFIG_HIGH_RES_TIMERS) +LX_CONFIG(CONFIG_NR_CPUS) LX_CONFIG(CONFIG_OF) +LX_CONFIG(CONFIG_TICK_ONESHOT) diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index ca11e8df31b6..008e62f3190d 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -135,6 +135,7 @@ and can help identify the state of hotplugged CPUs""" gdb.write("Online CPUs : {}\n".format(list(each_online_cpu()))) gdb.write("Active CPUs : {}\n".format(list(each_active_cpu()))) + LxCpus() diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py index 2f335fbd86fd..c487ddf09d38 100644 --- a/scripts/gdb/linux/lists.py +++ b/scripts/gdb/linux/lists.py @@ -16,13 +16,15 @@ import gdb from linux import utils list_head = utils.CachedType("struct list_head") +hlist_head = utils.CachedType("struct hlist_head") +hlist_node = utils.CachedType("struct hlist_node") def list_for_each(head): if head.type == list_head.get_type().pointer(): head = head.dereference() elif head.type != list_head.get_type(): - raise gdb.GdbError("Must be struct list_head not {}" + raise TypeError("Must be struct list_head not {}" .format(head.type)) node = head['next'].dereference() @@ -33,9 +35,24 @@ def list_for_each(head): def list_for_each_entry(head, gdbtype, member): for node in list_for_each(head): - if node.type != list_head.get_type().pointer(): - raise TypeError("Type {} found. Expected struct list_head *." - .format(node.type)) + yield utils.container_of(node, gdbtype, member) + + +def hlist_for_each(head): + if head.type == hlist_head.get_type().pointer(): + head = head.dereference() + elif head.type != hlist_head.get_type(): + raise TypeError("Must be struct hlist_head not {}" + .format(head.type)) + + node = head['first'].dereference() + while node.address: + yield node.address + node = node['next'].dereference() + + +def hlist_for_each_entry(head, gdbtype, member): + for node in hlist_for_each(head): yield utils.container_of(node, gdbtype, member) @@ -110,4 +127,5 @@ class LxListChk(gdb.Command): raise gdb.GdbError("lx-list-check takes one argument") list_check(gdb.parse_and_eval(argv[0])) + LxListChk() diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 2f01a958eb22..6a56bba233a9 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -29,6 +29,7 @@ class LxCmdLine(gdb.Command): def invoke(self, arg, from_tty): gdb.write(gdb.parse_and_eval("saved_command_line").string() + "\n") + LxCmdLine() @@ -43,6 +44,7 @@ class LxVersion(gdb.Command): # linux_banner should contain a newline gdb.write(gdb.parse_and_eval("(char *)linux_banner").string()) + LxVersion() @@ -86,6 +88,7 @@ Equivalent to cat /proc/iomem on a running target""" def invoke(self, arg, from_tty): return show_lx_resources("iomem_resource") + LxIOMem() @@ -100,6 +103,7 @@ Equivalent to cat /proc/ioports on a running target""" def invoke(self, arg, from_tty): return show_lx_resources("ioport_resource") + LxIOPorts() @@ -149,7 +153,7 @@ values of that process namespace""" if len(argv) >= 1: try: pid = int(argv[0]) - except: + except gdb.error: raise gdb.GdbError("Provide a PID as integer value") else: pid = 1 @@ -195,6 +199,7 @@ values of that process namespace""" info_opts(FS_INFO, s_flags), info_opts(MNT_INFO, m_flags))) + LxMounts() @@ -259,7 +264,7 @@ class LxFdtDump(gdb.Command): try: f = open(filename, 'wb') - except: + except gdb.error: raise gdb.GdbError("Could not open file to dump fdt") f.write(fdt_buf) @@ -267,4 +272,5 @@ class LxFdtDump(gdb.Command): gdb.write("Dumped fdt blob to " + filename + "\n") + LxFdtDump() diff --git a/scripts/gdb/linux/rbtree.py b/scripts/gdb/linux/rbtree.py new file mode 100644 index 000000000000..39db889b874c --- /dev/null +++ b/scripts/gdb/linux/rbtree.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2019 Google LLC. + +import gdb + +from linux import utils + +rb_root_type = utils.CachedType("struct rb_root") +rb_node_type = utils.CachedType("struct rb_node") + + +def rb_first(root): + if root.type == rb_root_type.get_type(): + node = node.address.cast(rb_root_type.get_type().pointer()) + elif root.type != rb_root_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_root not {}".format(root.type)) + + node = root['rb_node'] + if node is 0: + return None + + while node['rb_left']: + node = node['rb_left'] + + return node + + +def rb_last(root): + if root.type == rb_root_type.get_type(): + node = node.address.cast(rb_root_type.get_type().pointer()) + elif root.type != rb_root_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_root not {}".format(root.type)) + + node = root['rb_node'] + if node is 0: + return None + + while node['rb_right']: + node = node['rb_right'] + + return node + + +def rb_parent(node): + parent = gdb.Value(node['__rb_parent_color'] & ~3) + return parent.cast(rb_node_type.get_type().pointer()) + + +def rb_empty_node(node): + return node['__rb_parent_color'] == node.address + + +def rb_next(node): + if node.type == rb_node_type.get_type(): + node = node.address.cast(rb_node_type.get_type().pointer()) + elif node.type != rb_node_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_node not {}".format(node.type)) + + if rb_empty_node(node): + return None + + if node['rb_right']: + node = node['rb_right'] + while node['rb_left']: + node = node['rb_left'] + return node + + parent = rb_parent(node) + while parent and node == parent['rb_right']: + node = parent + parent = rb_parent(node) + + return parent + + +def rb_prev(node): + if node.type == rb_node_type.get_type(): + node = node.address.cast(rb_node_type.get_type().pointer()) + elif node.type != rb_node_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_node not {}".format(node.type)) + + if rb_empty_node(node): + return None + + if node['rb_left']: + node = node['rb_left'] + while node['rb_right']: + node = node['rb_right'] + return node.dereference() + + parent = rb_parent(node) + while parent and node == parent['rb_left'].dereference(): + node = parent + parent = rb_parent(node) + + return parent + + +class LxRbFirst(gdb.Function): + """Lookup and return a node from an RBTree + +$lx_rb_first(root): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbFirst, self).__init__("lx_rb_first") + + def invoke(self, root): + result = rb_first(root) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbFirst() + + +class LxRbLast(gdb.Function): + """Lookup and return a node from an RBTree. + +$lx_rb_last(root): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbLast, self).__init__("lx_rb_last") + + def invoke(self, root): + result = rb_last(root) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbLast() + + +class LxRbNext(gdb.Function): + """Lookup and return a node from an RBTree. + +$lx_rb_next(node): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbNext, self).__init__("lx_rb_next") + + def invoke(self, node): + result = rb_next(node) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbNext() + + +class LxRbPrev(gdb.Function): + """Lookup and return a node from an RBTree. + +$lx_rb_prev(node): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbPrev, self).__init__("lx_rb_prev") + + def invoke(self, node): + result = rb_prev(node) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbPrev() diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 004b0ac7fa72..2f5b95f09fa0 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -139,8 +139,12 @@ lx-symbols command.""" saved_states.append({'breakpoint': bp, 'enabled': bp.enabled}) # drop all current symbols and reload vmlinux + orig_vmlinux = 'vmlinux' + for obj in gdb.objfiles(): + if obj.filename.endswith('vmlinux'): + orig_vmlinux = obj.filename gdb.execute("symbol-file", to_string=True) - gdb.execute("symbol-file vmlinux") + gdb.execute("symbol-file {0}".format(orig_vmlinux)) self.loaded_modules = [] module_list = modules.module_list() diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py index f6ab3ccf698f..0301dc1e0138 100644 --- a/scripts/gdb/linux/tasks.py +++ b/scripts/gdb/linux/tasks.py @@ -79,6 +79,7 @@ class LxPs(gdb.Command): pid=task["pid"], comm=task["comm"].string())) + LxPs() @@ -134,4 +135,5 @@ variable.""" else: raise gdb.GdbError("No task of PID " + str(pid)) + LxThreadInfoByPidFunc() diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py new file mode 100644 index 000000000000..071d0dd5a634 --- /dev/null +++ b/scripts/gdb/linux/timerlist.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2019 Google LLC. + +import binascii +import gdb + +from linux import constants +from linux import cpus +from linux import rbtree +from linux import utils + +timerqueue_node_type = utils.CachedType("struct timerqueue_node").get_type() +hrtimer_type = utils.CachedType("struct hrtimer").get_type() + + +def ktime_get(): + """Returns the current time, but not very accurately + + We can't read the hardware timer itself to add any nanoseconds + that need to be added since we last stored the time in the + timekeeper. But this is probably good enough for debug purposes.""" + tk_core = gdb.parse_and_eval("&tk_core") + + return tk_core['timekeeper']['tkr_mono']['base'] + + +def print_timer(rb_node, idx): + timerqueue = utils.container_of(rb_node, timerqueue_node_type.pointer(), + "node") + timer = utils.container_of(timerqueue, hrtimer_type.pointer(), "node") + + function = str(timer['function']).split(" ")[1].strip("<>") + softexpires = timer['_softexpires'] + expires = timer['node']['expires'] + now = ktime_get() + + text = " #{}: <{}>, {}, ".format(idx, timer, function) + text += "S:{:02x}\n".format(int(timer['state'])) + text += " # expires at {}-{} nsecs [in {} to {} nsecs]\n".format( + softexpires, expires, softexpires - now, expires - now) + return text + + +def print_active_timers(base): + curr = base['active']['next']['node'] + curr = curr.address.cast(rbtree.rb_node_type.get_type().pointer()) + idx = 0 + while curr: + yield print_timer(curr, idx) + curr = rbtree.rb_next(curr) + idx += 1 + + +def print_base(base): + text = " .base: {}\n".format(base.address) + text += " .index: {}\n".format(base['index']) + + text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution) + + text += " .get_time: {}\n".format(base['get_time']) + if constants.LX_CONFIG_HIGH_RES_TIMERS: + text += " .offset: {} nsecs\n".format(base['offset']) + text += "active timers:\n" + text += "".join([x for x in print_active_timers(base)]) + return text + + +def print_cpu(hrtimer_bases, cpu, max_clock_bases): + cpu_base = cpus.per_cpu(hrtimer_bases, cpu) + jiffies = gdb.parse_and_eval("jiffies_64") + tick_sched_ptr = gdb.parse_and_eval("&tick_cpu_sched") + ts = cpus.per_cpu(tick_sched_ptr, cpu) + + text = "cpu: {}\n".format(cpu) + for i in xrange(max_clock_bases): + text += " clock {}:\n".format(i) + text += print_base(cpu_base['clock_base'][i]) + + if constants.LX_CONFIG_HIGH_RES_TIMERS: + fmts = [(" .{} : {} nsecs", 'expires_next'), + (" .{} : {}", 'hres_active'), + (" .{} : {}", 'nr_events'), + (" .{} : {}", 'nr_retries'), + (" .{} : {}", 'nr_hangs'), + (" .{} : {}", 'max_hang_time')] + text += "\n".join([s.format(f, cpu_base[f]) for s, f in fmts]) + text += "\n" + + if constants.LX_CONFIG_TICK_ONESHOT: + fmts = [(" .{} : {}", 'nohz_mode'), + (" .{} : {} nsecs", 'last_tick'), + (" .{} : {}", 'tick_stopped'), + (" .{} : {}", 'idle_jiffies'), + (" .{} : {}", 'idle_calls'), + (" .{} : {}", 'idle_sleeps'), + (" .{} : {} nsecs", 'idle_entrytime'), + (" .{} : {} nsecs", 'idle_waketime'), + (" .{} : {} nsecs", 'idle_exittime'), + (" .{} : {} nsecs", 'idle_sleeptime'), + (" .{}: {} nsecs", 'iowait_sleeptime'), + (" .{} : {}", 'last_jiffies'), + (" .{} : {}", 'next_timer'), + (" .{} : {} nsecs", 'idle_expires')] + text += "\n".join([s.format(f, ts[f]) for s, f in fmts]) + text += "\njiffies: {}\n".format(jiffies) + + text += "\n" + + return text + + +def print_tickdevice(td, cpu): + dev = td['evtdev'] + text = "Tick Device: mode: {}\n".format(td['mode']) + if cpu < 0: + text += "Broadcast device\n" + else: + text += "Per CPU device: {}\n".format(cpu) + + text += "Clock Event Device: " + if dev == 0: + text += "<NULL>\n" + return text + + text += "{}\n".format(dev['name']) + text += " max_delta_ns: {}\n".format(dev['max_delta_ns']) + text += " min_delta_ns: {}\n".format(dev['min_delta_ns']) + text += " mult: {}\n".format(dev['mult']) + text += " shift: {}\n".format(dev['shift']) + text += " mode: {}\n".format(dev['state_use_accessors']) + text += " next_event: {} nsecs\n".format(dev['next_event']) + + text += " set_next_event: {}\n".format(dev['set_next_event']) + + members = [('set_state_shutdown', " shutdown: {}\n"), + ('set_state_periodic', " periodic: {}\n"), + ('set_state_oneshot', " oneshot: {}\n"), + ('set_state_oneshot_stopped', " oneshot stopped: {}\n"), + ('tick_resume', " resume: {}\n")] + for member, fmt in members: + if dev[member]: + text += fmt.format(dev[member]) + + text += " event_handler: {}\n".format(dev['event_handler']) + text += " retries: {}\n".format(dev['retries']) + + return text + + +def pr_cpumask(mask): + nr_cpu_ids = 1 + if constants.LX_NR_CPUS > 1: + nr_cpu_ids = gdb.parse_and_eval("nr_cpu_ids") + + inf = gdb.inferiors()[0] + bits = mask['bits'] + num_bytes = (nr_cpu_ids + 7) / 8 + buf = utils.read_memoryview(inf, bits, num_bytes).tobytes() + buf = binascii.b2a_hex(buf) + + chunks = [] + i = num_bytes + while i > 0: + i -= 1 + start = i * 2 + end = start + 2 + chunks.append(buf[start:end]) + if i != 0 and i % 4 == 0: + chunks.append(',') + + extra = nr_cpu_ids % 8 + if 0 < extra <= 4: + chunks[0] = chunks[0][0] # Cut off the first 0 + + return "".join(chunks) + + +class LxTimerList(gdb.Command): + """Print /proc/timer_list""" + + def __init__(self): + super(LxTimerList, self).__init__("lx-timerlist", gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + hrtimer_bases = gdb.parse_and_eval("&hrtimer_bases") + max_clock_bases = gdb.parse_and_eval("HRTIMER_MAX_CLOCK_BASES") + + text = "Timer List Version: gdb scripts\n" + text += "HRTIMER_MAX_CLOCK_BASES: {}\n".format(max_clock_bases) + text += "now at {} nsecs\n".format(ktime_get()) + + for cpu in cpus.each_online_cpu(): + text += print_cpu(hrtimer_bases, cpu, max_clock_bases) + + if constants.LX_CONFIG_GENERIC_CLOCKEVENTS: + if constants.LX_CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: + bc_dev = gdb.parse_and_eval("&tick_broadcast_device") + text += print_tickdevice(bc_dev, -1) + text += "\n" + mask = gdb.parse_and_eval("tick_broadcast_mask") + mask = pr_cpumask(mask) + text += "tick_broadcast_mask: {}\n".format(mask) + if constants.LX_CONFIG_TICK_ONESHOT: + mask = gdb.parse_and_eval("tick_broadcast_oneshot_mask") + mask = pr_cpumask(mask) + text += "tick_broadcast_oneshot_mask: {}\n".format(mask) + text += "\n" + + tick_cpu_devices = gdb.parse_and_eval("&tick_cpu_device") + for cpu in cpus.each_online_cpu(): + tick_dev = cpus.per_cpu(tick_cpu_devices, cpu) + text += print_tickdevice(tick_dev, cpu) + text += "\n" + + gdb.write(text) + + +LxTimerList() diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py index 50805874cfc3..bc67126118c4 100644 --- a/scripts/gdb/linux/utils.py +++ b/scripts/gdb/linux/utils.py @@ -66,6 +66,7 @@ Note that TYPE and ELEMENT have to be quoted as strings.""" return container_of(ptr, gdb.lookup_type(typename.string()).pointer(), elementname.string()) + ContainerOf() @@ -148,14 +149,14 @@ def get_gdbserver_type(): def probe_qemu(): try: return gdb.execute("monitor info version", to_string=True) != "" - except: + except gdb.error: return False def probe_kgdb(): try: thread_info = gdb.execute("info thread 2", to_string=True) return "shadowCPU0" in thread_info - except: + except gdb.error: return False global gdbserver_type @@ -172,7 +173,7 @@ def get_gdbserver_type(): def gdb_eval_or_none(expresssion): try: return gdb.parse_and_eval(expresssion) - except: + except gdb.error: return None diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py index 6e0b0afd888a..eff5a48ac026 100644 --- a/scripts/gdb/vmlinux-gdb.py +++ b/scripts/gdb/vmlinux-gdb.py @@ -27,7 +27,11 @@ else: import linux.modules import linux.dmesg import linux.tasks + import linux.config import linux.cpus import linux.lists + import linux.rbtree import linux.proc import linux.constants + import linux.timerlist + import linux.clk diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index 64073e050c6a..b02279da6fa1 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -6,4 +6,5 @@ execveat.moved execveat.path.ephemeral execveat.ephemeral execveat.denatured -xxxxxxxx*
\ No newline at end of file +/recursion-depth +xxxxxxxx* diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 427c41ba5151..33339e31e365 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -1,11 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS = -Wall +CFLAGS += -Wno-nonnull +CFLAGS += -D_GNU_SOURCE TEST_GEN_PROGS := execveat TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile +TEST_GEN_PROGS += recursion-depth + EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* include ../lib.mk diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c new file mode 100644 index 000000000000..2dbd5bc45b3e --- /dev/null +++ b/tools/testing/selftests/exec/recursion-depth.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test that pointing #! script interpreter to self doesn't recurse. */ +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mount.h> +#include <unistd.h> + +int main(void) +{ + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 4; + } + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 1; + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + fprintf(stderr, "error: mount '/', errno %d\n", errno); + return 1; + } + /* Require "exec" filesystem. */ + if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) { + fprintf(stderr, "error: mount ramfs, errno %d\n", errno); + return 1; + } + +#define FILENAME "/tmp/1" + + int fd = creat(FILENAME, 0700); + if (fd == -1) { + fprintf(stderr, "error: creat, errno %d\n", errno); + return 1; + } +#define S "#!" FILENAME "\n" + if (write(fd, S, strlen(S)) != strlen(S)) { + fprintf(stderr, "error: write, errno %d\n", errno); + return 1; + } + close(fd); + + int rv = execve(FILENAME, NULL, NULL); + if (rv == -1 && errno == ELOOP) { + return 0; + } + fprintf(stderr, "error: execve, rv %d, errno %d\n", rv, errno); + return 1; +} diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 780ce7123374..6a970b127c9b 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -24,19 +24,21 @@ TEST_FILE=$(mktemp) # This represents # -# TEST_ID:TEST_COUNT:ENABLED +# TEST_ID:TEST_COUNT:ENABLED:TARGET # # TEST_ID: is the test id number # TEST_COUNT: number of times we should run the test # ENABLED: 1 if enabled, 0 otherwise +# TARGET: test target file required on the test_sysctl module # # Once these are enabled please leave them as-is. Write your own test, # we have tons of space. -ALL_TESTS="0001:1:1" -ALL_TESTS="$ALL_TESTS 0002:1:1" -ALL_TESTS="$ALL_TESTS 0003:1:1" -ALL_TESTS="$ALL_TESTS 0004:1:1" -ALL_TESTS="$ALL_TESTS 0005:3:1" +ALL_TESTS="0001:1:1:int_0001" +ALL_TESTS="$ALL_TESTS 0002:1:1:string_0001" +ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" +ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" +ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" +ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" test_modprobe() { @@ -149,6 +151,9 @@ reset_vals() string_0001) VAL="(none)" ;; + bitmap_0001) + VAL="" + ;; *) ;; esac @@ -157,8 +162,10 @@ reset_vals() set_orig() { - if [ ! -z $TARGET ]; then - echo "${ORIG}" > "${TARGET}" + if [ ! -z $TARGET ] && [ ! -z $ORIG ]; then + if [ -f ${TARGET} ]; then + echo "${ORIG}" > "${TARGET}" + fi fi } @@ -177,9 +184,25 @@ verify() return 0 } +# proc files get read a page at a time, which can confuse diff, +# and get you incorrect results on proc files with long data. To use +# diff against them you must first extract the output to a file, and +# then compare against that file. +verify_diff_proc_file() +{ + TMP_DUMP_FILE=$(mktemp) + cat $1 > $TMP_DUMP_FILE + + if ! diff -w -q $TMP_DUMP_FILE $2; then + return 1 + else + return 0 + fi +} + verify_diff_w() { - echo "$TEST_STR" | diff -q -w -u - $1 + echo "$TEST_STR" | diff -q -w -u - $1 > /dev/null return $? } @@ -600,9 +623,70 @@ run_stringtests() test_rc } +target_exists() +{ + TARGET="${SYSCTL}/$1" + TEST_ID="$2" + + if [ ! -f ${TARGET} ] ; then + echo "Target for test $TEST_ID: $TARGET not exist, skipping test ..." + return 0 + fi + return 1 +} + +run_bitmaptest() { + # Total length of bitmaps string to use, a bit under + # the maximum input size of the test node + LENGTH=$((RANDOM % 65000)) + + # First bit to set + BIT=$((RANDOM % 1024)) + + # String containing our list of bits to set + TEST_STR=$BIT + + # build up the string + while [ "${#TEST_STR}" -le "$LENGTH" ]; do + # Make sure next entry is discontiguous, + # skip ahead at least 2 + BIT=$((BIT + $((2 + RANDOM % 10)))) + + # Add new bit to the list + TEST_STR="${TEST_STR},${BIT}" + + # Randomly make it a range + if [ "$((RANDOM % 2))" -eq "1" ]; then + RANGE_END=$((BIT + $((1 + RANDOM % 10)))) + TEST_STR="${TEST_STR}-${RANGE_END}" + BIT=$RANGE_END + fi + done + + echo -n "Checking bitmap handler... " + TEST_FILE=$(mktemp) + echo -n "$TEST_STR" > $TEST_FILE + + cat $TEST_FILE > $TARGET 2> /dev/null + if [ $? -ne 0 ]; then + echo "FAIL" >&2 + rc=1 + test_rc + fi + + if ! verify_diff_proc_file "$TARGET" "$TEST_FILE"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + rc=0 + fi + test_rc +} + sysctl_test_0001() { - TARGET="${SYSCTL}/int_0001" + TARGET="${SYSCTL}/$(get_test_target 0001)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR=$(( $ORIG + 1 )) @@ -614,7 +698,7 @@ sysctl_test_0001() sysctl_test_0002() { - TARGET="${SYSCTL}/string_0001" + TARGET="${SYSCTL}/$(get_test_target 0002)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR="Testing sysctl" @@ -627,7 +711,7 @@ sysctl_test_0002() sysctl_test_0003() { - TARGET="${SYSCTL}/int_0002" + TARGET="${SYSCTL}/$(get_test_target 0003)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR=$(( $ORIG + 1 )) @@ -640,7 +724,7 @@ sysctl_test_0003() sysctl_test_0004() { - TARGET="${SYSCTL}/uint_0001" + TARGET="${SYSCTL}/$(get_test_target 0004)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR=$(( $ORIG + 1 )) @@ -653,13 +737,21 @@ sysctl_test_0004() sysctl_test_0005() { - TARGET="${SYSCTL}/int_0003" + TARGET="${SYSCTL}/$(get_test_target 0005)" reset_vals ORIG=$(cat "${TARGET}") run_limit_digit_int_array } +sysctl_test_0006() +{ + TARGET="${SYSCTL}/bitmap_0001" + reset_vals + ORIG="" + run_bitmaptest +} + list_tests() { echo "Test ID list:" @@ -673,10 +765,9 @@ list_tests() echo "0003 x $(get_test_count 0003) - tests proc_dointvec()" echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" + echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" } -test_reqs - usage() { NUM_TESTS=$(grep -o ' ' <<<"$ALL_TESTS" | grep -c .) @@ -724,25 +815,35 @@ function get_test_count() { test_num $1 TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') - LAST_TWO=${TEST_DATA#*:*} - echo ${LAST_TWO%:*} + echo ${TEST_DATA} | awk -F":" '{print $2}' } function get_test_enabled() { test_num $1 TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') - echo ${TEST_DATA#*:*:} + echo ${TEST_DATA} | awk -F":" '{print $3}' +} + +function get_test_target() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + echo ${TEST_DATA} | awk -F":" '{print $4}' } function run_all_tests() { for i in $ALL_TESTS ; do - TEST_ID=${i%:*:*} + TEST_ID=${i%:*:*:*} ENABLED=$(get_test_enabled $TEST_ID) TEST_COUNT=$(get_test_count $TEST_ID) + TEST_TARGET=$(get_test_target $TEST_ID) + if target_exists $TEST_TARGET $TEST_ID; then + continue + fi if [[ $ENABLED -eq "1" ]]; then - test_case $TEST_ID $TEST_COUNT + test_case $TEST_ID $TEST_COUNT $TEST_TARGET fi done } @@ -775,12 +876,14 @@ function watch_case() function test_case() { - NUM_TESTS=$DEFAULT_NUM_TESTS - if [ $# -eq 2 ]; then - NUM_TESTS=$2 - fi + NUM_TESTS=$2 i=0 + + if target_exists $3 $1; then + continue + fi + while [ $i -lt $NUM_TESTS ]; do test_num $1 watch_log $i ${TEST_NAME}_test_$1 noclear @@ -803,15 +906,15 @@ function parse_args() elif [[ "$1" = "-t" ]]; then shift test_num $1 - test_case $1 $(get_test_count $1) + test_case $1 $(get_test_count $1) $(get_test_target $1) elif [[ "$1" = "-c" ]]; then shift test_num $1 test_num $2 - test_case $1 $2 + test_case $1 $2 $(get_test_target $1) elif [[ "$1" = "-s" ]]; then shift - test_case $1 1 + test_case $1 1 $(get_test_target $1) elif [[ "$1" = "-l" ]]; then list_tests elif [[ "$1" = "-h" || "$1" = "--help" ]]; then @@ -825,8 +928,8 @@ function parse_args() test_reqs allow_user_defaults check_production_sysctl_writes_strict -test_modprobe load_req_mod +test_modprobe trap "test_finish" EXIT |