diff options
945 files changed, 17108 insertions, 9706 deletions
diff --git a/.gitignore b/.gitignore index 70ec6037fa7a..7f86e0837909 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,7 @@ modules.order !.get_maintainer.ignore !.gitattributes !.gitignore +!.kunitconfig !.mailmap !.rustfmt.toml @@ -232,6 +232,8 @@ Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com> John Crispin <john@phrozen.org> <blogic@openwrt.org> John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> John Stultz <johnstul@us.ibm.com> +<jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com> +<jon.toppins+linux@gmail.com> <jtoppins@redhat.com> Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org> <josh@joshtriplett.org> <josh@freedesktop.org> <josh@joshtriplett.org> <josh@kernel.org> @@ -297,6 +299,8 @@ Martin Kepplinger <martink@posteo.de> <martin.kepplinger@puri.sm> Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com> Martyna Szapar-Mudlaw <martyna.szapar-mudlaw@linux.intel.com> <martyna.szapar-mudlaw@intel.com> Mathieu Othacehe <m.othacehe@gmail.com> +Mat Martineau <martineau@kernel.org> <mathew.j.martineau@linux.intel.com> +Mat Martineau <martineau@kernel.org> <mathewm@codeaurora.org> Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com> Matthew Wilcox <willy@infradead.org> <matthew@wil.cx> Matthew Wilcox <willy@infradead.org> <mawilcox@linuxonhyperv.com> @@ -229,6 +229,10 @@ S: University of Notre Dame S: Notre Dame, Indiana S: USA +N: Kai Bankett +E: chaosman@ontika.net +D: QNX6 filesystem + N: Greg Banks E: gnb@alphalink.com.au D: IDT77105 ATM network driver @@ -886,6 +890,10 @@ W: http://jdelvare.nerim.net/ D: Several hardware monitoring drivers S: France +N: Frank "Jedi/Sector One" Denis +E: j@pureftpd.org +D: QNX4 filesystem + N: Peter Denison E: peterd@pnd-pc.demon.co.uk W: http://www.pnd-pc.demon.co.uk/promise/ @@ -1259,6 +1267,10 @@ S: USA N: Adam Fritzler E: mid@zigamorph.net +N: Richard "Scuba" A. Frowijn +E: scuba@wxs.nl +D: QNX4 filesystem + N: Fernando Fuganti E: fuganti@conectiva.com.br E: fuganti@netbank.com.br @@ -2218,6 +2230,10 @@ D: Digiboard PC/Xe and PC/Xi, Digiboard EPCA D: NUMA support, Slab allocators, Page migration D: Scalability, Time subsystem +N: Anders Larsen +E: al@alarsen.net +D: QNX4 filesystem + N: Paul Laufer E: paul@laufernet.com D: Soundblaster driver fixes, ISAPnP quirk diff --git a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot b/Documentation/ABI/removed/sysfs-selinux-checkreqprot index ed6b52ca210f..f599a0a87e8b 100644 --- a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot +++ b/Documentation/ABI/removed/sysfs-selinux-checkreqprot @@ -4,6 +4,9 @@ KernelVersion: 2.6.12-rc2 (predates git) Contact: selinux@vger.kernel.org Description: + REMOVAL UPDATE: The SELinux checkreqprot functionality was removed in + March 2023, the original deprecation notice is shown below. + The selinuxfs "checkreqprot" node allows SELinux to be configured to check the protection requested by userspace for mmap/mprotect calls instead of the actual protection applied by the kernel. diff --git a/Documentation/ABI/obsolete/sysfs-selinux-disable b/Documentation/ABI/removed/sysfs-selinux-disable index c340278e3cf8..cb783c64cab3 100644 --- a/Documentation/ABI/obsolete/sysfs-selinux-disable +++ b/Documentation/ABI/removed/sysfs-selinux-disable @@ -4,6 +4,9 @@ KernelVersion: 2.6.12-rc2 (predates git) Contact: selinux@vger.kernel.org Description: + REMOVAL UPDATE: The SELinux runtime disable functionality was removed + in March 2023, the original deprecation notice is shown below. + The selinuxfs "disable" node allows SELinux to be disabled at runtime prior to a policy being loaded into the kernel. If disabled via this mechanism, SELinux will remain disabled until the system is rebooted. diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst index c9c957c85bac..93d899d53258 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst @@ -277,7 +277,7 @@ the following access functions: Again, only one request in a given batch need actually carry out a grace-period operation, which means there must be an efficient way to -identify which of many concurrent reqeusts will initiate the grace +identify which of many concurrent requests will initiate the grace period, and that there be an efficient way for the remaining requests to wait for that grace period to complete. However, that is the topic of the next section. @@ -405,7 +405,7 @@ Use of Workqueues In earlier implementations, the task requesting the expedited grace period also drove it to completion. This straightforward approach had the disadvantage of needing to account for POSIX signals sent to user -tasks, so more recent implemementations use the Linux kernel's +tasks, so more recent implementations use the Linux kernel's workqueues (see Documentation/core-api/workqueue.rst). The requesting task still does counter snapshotting and funnel-lock @@ -465,7 +465,7 @@ corresponding disadvantage that workqueues cannot be used until they are initialized, which does not happen until some time after the scheduler spawns the first task. Given that there are parts of the kernel that really do want to execute grace periods during this mid-boot “dead -zoneâ€, expedited grace periods must do something else during thie time. +zoneâ€, expedited grace periods must do something else during this time. What they do is to fall back to the old practice of requiring that the requesting task drive the expedited grace period, as was the case before diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 7fdf151a8680..5750f125361b 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -168,7 +168,7 @@ an ``atomic_add_return()`` of zero) to detect idle CPUs. +-----------------------------------------------------------------------+ The approach must be extended to handle one final case, that of waking a -task blocked in ``synchronize_rcu()``. This task might be affinitied to +task blocked in ``synchronize_rcu()``. This task might be affined to a CPU that is not yet aware that the grace period has ended, and thus might not yet be subject to the grace period's memory ordering. Therefore, there is an ``smp_mb()`` after the return from diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 588d97366a46..db8f16b392aa 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -201,7 +201,7 @@ work looked at debugging uses of RCU [Seyster:2011:RFA:2075416.2075425]. In 2012, Josh Triplett received his Ph.D. with his dissertation covering RCU-protected resizable hash tables and the relationship between memory barriers and read-side traversal order: If the updater -is making changes in the opposite direction from the read-side traveral +is making changes in the opposite direction from the read-side traversal order, the updater need only execute a memory-barrier instruction, but if in the same direction, the updater needs to wait for a grace period between the individual updates [JoshTriplettPhD]. Also in 2012, @@ -1245,7 +1245,7 @@ Oregon Health and Sciences University" [Viewed September 5, 2005]" ,annotation={ First posting showing how RCU can be safely adapted for - preemptable RCU read side critical sections. + preemptible RCU read side critical sections. } } @@ -1888,7 +1888,7 @@ Revised: \url{https://lore.kernel.org/r/20070910183004.GA3299@linux.vnet.ibm.com} [Viewed October 25, 2007]" ,annotation={ - Final patch for preemptable RCU to -rt. (Later patches were + Final patch for preemptible RCU to -rt. (Later patches were to mainline, eventually incorporated.) } } @@ -2275,7 +2275,7 @@ lot of {Linux} into your technology!!!" \url{https://lore.kernel.org/r/20090724001429.GA17374@linux.vnet.ibm.com} [Viewed August 15, 2009]" ,annotation={ - First posting of simple and fast preemptable RCU. + First posting of simple and fast preemptible RCU. } } @@ -2639,7 +2639,7 @@ lot of {Linux} into your technology!!!" RCU-protected hash tables, barriers vs. read-side traversal order. . If the updater is making changes in the opposite direction from - the read-side traveral order, the updater need only execute a + the read-side traversal order, the updater need only execute a memory-barrier instruction, but if in the same direction, the updater needs to wait for a grace period between the individual updates. diff --git a/Documentation/RCU/UP.rst b/Documentation/RCU/UP.rst index 8b20fd45f255..4060d7a2f62a 100644 --- a/Documentation/RCU/UP.rst +++ b/Documentation/RCU/UP.rst @@ -107,7 +107,7 @@ UP systems, including PREEMPT SMP builds running on UP systems. Quick Quiz #3: Why can't synchronize_rcu() return immediately on UP systems running - preemptable RCU? + preemptible RCU? .. _answer_quick_quiz_up: @@ -143,7 +143,7 @@ Answer to Quick Quiz #2: Answer to Quick Quiz #3: Why can't synchronize_rcu() return immediately on UP systems - running preemptable RCU? + running preemptible RCU? Because some other task might have been preempted in the middle of an RCU read-side critical section. If synchronize_rcu() diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index cc361fb01ed4..bd3c58c44bef 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -70,7 +70,7 @@ over a rather long period of time, but improvements are always welcome! can serve as rcu_read_lock_sched(), but is less readable and prevents lockdep from detecting locking issues. - Please not that you *cannot* rely on code known to be built + Please note that you *cannot* rely on code known to be built only in non-preemptible kernels. Such code can and will break, especially in kernels built with CONFIG_PREEMPT_COUNT=y. diff --git a/Documentation/RCU/lockdep.rst b/Documentation/RCU/lockdep.rst index 2749f43ec1b0..69e73a39bd11 100644 --- a/Documentation/RCU/lockdep.rst +++ b/Documentation/RCU/lockdep.rst @@ -65,7 +65,7 @@ checking of rcu_dereference() primitives: rcu_access_pointer(p): Return the value of the pointer and omit all barriers, but retain the compiler constraints that prevent duplicating - or coalescsing. This is useful when testing the + or coalescing. This is useful when testing the value of the pointer itself, for example, against NULL. The rcu_dereference_check() check expression can be any boolean diff --git a/Documentation/RCU/torture.rst b/Documentation/RCU/torture.rst index 0316ba0c6922..b3b6dfa85248 100644 --- a/Documentation/RCU/torture.rst +++ b/Documentation/RCU/torture.rst @@ -216,7 +216,7 @@ Kernel boot arguments can also be supplied, for example, to control rcutorture's module parameters. For example, to test a change to RCU's CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'". This will of course result in the scripting reporting a failure, namely -the resuling RCU CPU stall warning. As noted above, reducing memory may +the resulting RCU CPU stall warning. As noted above, reducing memory may require disabling rcutorture's callback-flooding tests:: kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \ @@ -370,5 +370,5 @@ You can also re-run a previous remote run in a manner similar to kvm.sh: tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28-remote \ --duration 24h -In this case, most of the kvm-again.sh parmeters may be supplied following +In this case, most of the kvm-again.sh parameters may be supplied following the pathname of the old run-results directory. diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 2c5563a91998..8eddef28d3a1 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -597,10 +597,10 @@ to avoid having to write your own callback:: If the occasional sleep is permitted, the single-argument form may be used, omitting the rcu_head structure from struct foo. - kfree_rcu(old_fp); + kfree_rcu_mightsleep(old_fp); -This variant of kfree_rcu() almost never blocks, but might do so by -invoking synchronize_rcu() in response to memory-allocation failure. +This variant almost never blocks, but might do so by invoking +synchronize_rcu() in response to memory-allocation failure. Again, see checklist.rst for additional rules governing the use of RCU. diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst index f491de74ea79..48ca0bd85604 100644 --- a/Documentation/admin-guide/hw-vuln/mds.rst +++ b/Documentation/admin-guide/hw-vuln/mds.rst @@ -58,7 +58,7 @@ Because the buffers are potentially shared between Hyper-Threads cross Hyper-Thread attacks are possible. Deeper technical information is available in the MDS specific x86 -architecture section: :ref:`Documentation/x86/mds.rst <mds>`. +architecture section: :ref:`Documentation/arch/x86/mds.rst <mds>`. Attack scenarios diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst index 76673affd917..014167ef8dd1 100644 --- a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst +++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst @@ -63,7 +63,7 @@ attacker needs to begin a TSX transaction and raise an asynchronous abort which in turn potentially leaks data stored in the buffers. More detailed technical information is available in the TAA specific x86 -architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`. +architecture section: :ref:`Documentation/arch/x86/tsx_async_abort.rst <tsx_async_abort>`. Attack scenarios diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 09a563bbe3e7..43ea35613dfc 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -36,6 +36,7 @@ problems and bugs in particular. reporting-issues reporting-regressions + quickly-build-trimmed-linux bug-hunting bug-bisect tainted-kernels diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 19600c50277b..1ba8f2a44aac 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -128,10 +128,11 @@ parameter is applicable:: KVM Kernel Virtual Machine support is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. + LOONGARCH LoongArch architecture is enabled. LOOP Loopback device support is enabled. M68k M68k architecture is enabled. These options have more detailed description inside of - Documentation/m68k/kernel-options.rst. + Documentation/arch/m68k/kernel-options.rst. MDA MDA console support is enabled. MIPS MIPS architecture is enabled. MOUSE Appropriate mouse support is enabled. @@ -177,7 +178,7 @@ parameter is applicable:: X86-32 X86-32, aka i386 architecture is enabled. X86-64 X86-64 architecture is enabled. More X86-64 boot options can be found in - Documentation/x86/x86_64/boot-options.rst. + Documentation/arch/x86/x86_64/boot-options.rst. X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) X86_UV SGI UV support is enabled. XEN Xen support is enabled @@ -192,10 +193,10 @@ In addition, the following text indicates that the option:: Parameters denoted with BOOT are actually interpreted by the boot loader, and have no meaning to the kernel directly. Do not modify the syntax of boot loader parameters without extreme -need or coordination with <Documentation/x86/boot.rst>. +need or coordination with <Documentation/arch/x86/boot.rst>. There are also arch-specific kernel-parameters not documented here. -See for example <Documentation/x86/x86_64/boot-options.rst>. +See for example <Documentation/arch/x86/x86_64/boot-options.rst>. Note that ALL kernel parameters listed below are CASE SENSITIVE, and that a trailing = on the name of any parameter states that that parameter will diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6221a1d057dd..10e2e5c3ff0b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -929,9 +929,6 @@ debug_objects [KNL] Enable object debugging - no_debug_objects - [KNL] Disable object debugging - debug_guardpage_minorder= [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter allows control of the order of pages that will @@ -2976,7 +2973,7 @@ mce [X86-32] Machine Check Exception - mce=option [X86-64] See Documentation/x86/x86_64/boot-options.rst + mce=option [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst md= [HW] RAID subsystems devices and level See Documentation/admin-guide/md.rst. @@ -3184,9 +3181,6 @@ deep - Suspend-To-RAM or equivalent (if supported) See Documentation/admin-guide/pm/sleep-states.rst. - meye.*= [HW] Set MotionEye Camera parameters - See Documentation/admin-guide/media/meye.rst. - mfgpt_irq= [IA-32] Specify the IRQ to use for the Multi-Function General Purpose Timers on AMD Geode platforms. @@ -3428,14 +3422,13 @@ 1 to enable accounting Default value is 0. - nfsaddrs= [NFS] Deprecated. Use ip= instead. - See Documentation/admin-guide/nfs/nfsroot.rst. - - nfsroot= [NFS] nfs root filesystem for disk-less boxes. - See Documentation/admin-guide/nfs/nfsroot.rst. + nfs.cache_getent= + [NFS] sets the pathname to the program which is used + to update the NFS client cache entries. - nfsrootdebug [NFS] enable nfsroot debugging messages. - See Documentation/admin-guide/nfs/nfsroot.rst. + nfs.cache_getent_timeout= + [NFS] sets the timeout after which an attempt to + update a cache entry is deemed to have failed. nfs.callback_nr_threads= [NFSv4] set the total number of threads that the @@ -3446,18 +3439,6 @@ [NFS] set the TCP port on which the NFSv4 callback channel should listen. - nfs.cache_getent= - [NFS] sets the pathname to the program which is used - to update the NFS client cache entries. - - nfs.cache_getent_timeout= - [NFS] sets the timeout after which an attempt to - update a cache entry is deemed to have failed. - - nfs.idmap_cache_timeout= - [NFS] set the maximum lifetime for idmapper cache - entries. - nfs.enable_ino64= [NFS] enable 64-bit inode numbers. If zero, the NFS client will fake up a 32-bit inode @@ -3465,6 +3446,10 @@ of returning the full 64-bit number. The default is to return 64-bit inode numbers. + nfs.idmap_cache_timeout= + [NFS] set the maximum lifetime for idmapper cache + entries. + nfs.max_session_cb_slots= [NFSv4.1] Sets the maximum number of session slots the client will assign to the callback @@ -3492,21 +3477,14 @@ will be autodetected by the client, and it will fall back to using the idmapper. To turn off this behaviour, set the value to '0'. + nfs.nfs4_unique_id= [NFS4] Specify an additional fixed unique ident- ification string that NFSv4 clients can insert into their nfs_client_id4 string. This is typically a UUID that is generated at system install time. - nfs.send_implementation_id = - [NFSv4.1] Send client implementation identification - information in exchange_id requests. - If zero, no implementation identification information - will be sent. - The default is to send the implementation identification - information. - - nfs.recover_lost_locks = + nfs.recover_lost_locks= [NFSv4] Attempt to recover locks that were lost due to a lease timeout on the server. Please note that doing this risks data corruption, since there are @@ -3518,7 +3496,15 @@ The default parameter value of '0' causes the kernel not to attempt recovery of lost locks. - nfs4.layoutstats_timer = + nfs.send_implementation_id= + [NFSv4.1] Send client implementation identification + information in exchange_id requests. + If zero, no implementation identification information + will be sent. + The default is to send the implementation identification + information. + + nfs4.layoutstats_timer= [NFSv4.2] Change the rate at which the kernel sends layoutstats to the pNFS metadata server. @@ -3527,12 +3513,19 @@ driver. A non-zero value sets the minimum interval in seconds between layoutstats transmissions. - nfsd.inter_copy_offload_enable = + nfsd.inter_copy_offload_enable= [NFSv4.2] When set to 1, the server will support server-to-server copies for which this server is the destination of the copy. - nfsd.nfsd4_ssc_umount_timeout = + nfsd.nfs4_disable_idmapping= + [NFSv4] When set to the default of '1', the NFSv4 + server will return only numeric uids and gids to + clients using auth_sys, and will accept numeric uids + and gids from such clients. This is intended to ease + migration from NFSv2/v3. + + nfsd.nfsd4_ssc_umount_timeout= [NFSv4.2] When used as the destination of a server-to-server copy, knfsd temporarily mounts the source server. It caches the mount in case @@ -3540,13 +3533,14 @@ used for the number of milliseconds specified by this parameter. - nfsd.nfs4_disable_idmapping= - [NFSv4] When set to the default of '1', the NFSv4 - server will return only numeric uids and gids to - clients using auth_sys, and will accept numeric uids - and gids from such clients. This is intended to ease - migration from NFSv2/v3. + nfsaddrs= [NFS] Deprecated. Use ip= instead. + See Documentation/admin-guide/nfs/nfsroot.rst. + + nfsroot= [NFS] nfs root filesystem for disk-less boxes. + See Documentation/admin-guide/nfs/nfsroot.rst. + nfsrootdebug [NFS] enable nfsroot debugging messages. + See Documentation/admin-guide/nfs/nfsroot.rst. nmi_backtrace.backtrace_idle [KNL] Dump stacks even of idle CPUs in response to an @@ -3579,7 +3573,21 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - nofsgsbase [X86] Disables FSGSBASE instructions. + noaliencache [MM, NUMA, SLAB] Disables the allocation of alien + caches in the slab allocator. Saves per-node memory, + but will impact performance. + + noalign [KNL,ARM] + + noaltinstr [S390] Disables alternative instructions patching + (CPU alternatives feature). + + noapic [SMP,APIC] Tells the kernel to not make use of any + IOAPICs that may be present in the system. + + noautogroup Disable scheduler automatic task group creation. + + nocache [ARM] no_console_suspend [HW] Never suspend the console @@ -3596,32 +3604,8 @@ /sys/module/printk/parameters/console_suspend) to turn on/off it dynamically. - novmcoredd [KNL,KDUMP] - Disable device dump. Device dump allows drivers to - append dump data to vmcore so you can collect driver - specified debug info. Drivers can append the data - without any limit and this data is stored in memory, - so this may cause significant memory stress. Disabling - device dump can help save memory but the driver debug - data will be no longer available. This parameter - is only available when CONFIG_PROC_VMCORE_DEVICE_DUMP - is set. - - noaliencache [MM, NUMA, SLAB] Disables the allocation of alien - caches in the slab allocator. Saves per-node memory, - but will impact performance. - - noalign [KNL,ARM] - - noaltinstr [S390] Disables alternative instructions patching - (CPU alternatives feature). - - noapic [SMP,APIC] Tells the kernel to not make use of any - IOAPICs that may be present in the system. - - noautogroup Disable scheduler automatic task group creation. - - nocache [ARM] + no_debug_objects + [KNL] Disable object debugging nodsp [SH] Disable hardware DSP at boot time. @@ -3631,14 +3615,6 @@ noexec [IA-64] - nosmap [PPC] - Disable SMAP (Supervisor Mode Access Prevention) - even if it is supported by processor. - - nosmep [PPC64s] - Disable SMEP (Supervisor Mode Execution Prevention) - even if it is supported by processor. - noexec32 [X86-64] This affects only 32-bit executables. noexec32=on: enable non-executable mappings (default) @@ -3646,74 +3622,18 @@ noexec32=off: disable non-executable mappings read implies executable mappings + no_file_caps Tells the kernel not to honor file capabilities. The + only way then for a file to be executed with privilege + is to be setuid root or executed by root. + nofpu [MIPS,SH] Disable hardware FPU at boot time. + nofsgsbase [X86] Disables FSGSBASE instructions. + nofxsr [BUGS=X86-32] Disables x86 floating point extended register save and restore. The kernel will only save legacy floating-point registers on task switch. - nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings. - - nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings. - - nosmt [KNL,S390] Disable symmetric multithreading (SMT). - Equivalent to smt=1. - - [KNL,X86] Disable symmetric multithreading (SMT). - nosmt=force: Force disable SMT, cannot be undone - via the sysfs control file. - - nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1 - (bounds check bypass). With this option data leaks are - possible in the system. - - nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for - the Spectre variant 2 (indirect branch prediction) - vulnerability. System may allow data leaks with this - option. - - nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch - history injection) vulnerability. System may allow data leaks - with this option. - - nospec_store_bypass_disable - [HW] Disable all mitigations for the Speculative Store Bypass vulnerability - - no_uaccess_flush - [PPC] Don't flush the L1-D cache after accessing user data. - - noxsave [BUGS=X86] Disables x86 extended register state save - and restore using xsave. The kernel will fallback to - enabling legacy floating-point and sse state. - - noxsaveopt [X86] Disables xsaveopt used in saving x86 extended - register states. The kernel will fall back to use - xsave to save the states. By using this parameter, - performance of saving the states is degraded because - xsave doesn't support modified optimization while - xsaveopt supports it on xsaveopt enabled systems. - - noxsaves [X86] Disables xsaves and xrstors used in saving and - restoring x86 extended register state in compacted - form of xsave area. The kernel will fall back to use - xsaveopt and xrstor to save and restore the states - in standard form of xsave area. By using this - parameter, xsave area per process might occupy more - memory on xsaves enabled systems. - - nohlt [ARM,ARM64,MICROBLAZE,SH] Forces the kernel to busy wait - in do_idle() and not use the arch_cpu_idle() - implementation; requires CONFIG_GENERIC_IDLE_POLL_SETUP - to be effective. This is useful on platforms where the - sleep(SH) or wfi(ARM,ARM64) instructions do not work - correctly or when doing power measurements to evaluate - the impact of the sleep instructions. This is also - useful when using JTAG debugger. - - no_file_caps Tells the kernel not to honor file capabilities. The - only way then for a file to be executed with privilege - is to be setuid root or executed by root. - nohalt [IA-64] Tells the kernel not to use the power saving function PAL_HALT_LIGHT when idle. This increases power-consumption. On the positive side, it reduces @@ -3737,6 +3657,19 @@ nohibernate [HIBERNATION] Disable hibernation and resume. + nohlt [ARM,ARM64,MICROBLAZE,SH] Forces the kernel to busy wait + in do_idle() and not use the arch_cpu_idle() + implementation; requires CONFIG_GENERIC_IDLE_POLL_SETUP + to be effective. This is useful on platforms where the + sleep(SH) or wfi(ARM,ARM64) instructions do not work + correctly or when doing power measurements to evaluate + the impact of the sleep instructions. This is also + useful when using JTAG debugger. + + nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings. + + nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings. + nohz= [KNL] Boottime enable/disable dynamic ticks Valid arguments: on, off Default: on @@ -3754,16 +3687,6 @@ Note that this argument takes precedence over the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option. - noiotrap [SH] Disables trapped I/O port accesses. - - noirqdebug [X86-32] Disables the code which attempts to detect and - disable unhandled interrupt sources. - - no_timer_check [X86,APIC] Disables the code which tests for - broken timer IRQ sources. - - noisapnp [ISAPNP] Disables ISA PnP code. - noinitrd [RAM] Tells the kernel not to load any configured initial RAM disk. @@ -3775,6 +3698,13 @@ noinvpcid [X86] Disable the INVPCID cpu feature. + noiotrap [SH] Disables trapped I/O port accesses. + + noirqdebug [X86-32] Disables the code which attempts to detect and + disable unhandled interrupt sources. + + noisapnp [ISAPNP] Disables ISA PnP code. + nojitter [IA-64] Disables jitter checking for ITC timers. nokaslr [KNL] @@ -3782,18 +3712,10 @@ kernel and module base offset ASLR (Address Space Layout Randomization). - no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver - no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page fault handling. - no-vmw-sched-clock - [X86,PV_OPS] Disable paravirtualized VMware scheduler - clock and use the default one. - - no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized - steal time accounting. steal time is computed, but - won't influence scheduler behaviour + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver nolapic [X86-32,APIC] Do not enable or use the local APIC. @@ -3806,10 +3728,6 @@ nomfgpt [X86-32] Disable Multi-Function General Purpose Timer usage (for AMD Geode machines). - nonmi_ipi [X86] Disable using NMI IPIs during panic/reboot to - shutdown the other cpus. Instead use the REBOOT_VECTOR - irq. - nomodeset Disable kernel modesetting. Most systems' firmware sets up a display mode and provides framebuffer memory for output. With nomodeset, DRM and fbdev drivers will @@ -3822,6 +3740,10 @@ nomodule Disable module load + nonmi_ipi [X86] Disable using NMI IPIs during panic/reboot to + shutdown the other cpus. Instead use the REBOOT_VECTOR + irq. + nopat [X86] Disable PAT (page attribute table extension of pagetables) support. @@ -3830,6 +3752,9 @@ nopku [X86] Disable Memory Protection Keys CPU feature found in some Intel CPUs. + nopti [X86-64] + Equivalent to pti=off + nopv= [X86,XEN,KVM,HYPER_V,VMWARE] Disables the PV optimizations forcing the guest to run as generic guest with no PV drivers. Currently support @@ -3849,21 +3774,77 @@ noresume [SWSUSP] Disables resume and restores original swap space. + nosbagart [IA-64] + no-scroll [VGA] Disables scrollback. This is required for the Braillex ib80-piezo Braille reader made by F.H. Papenmeier (Germany). - nosbagart [IA-64] - nosgx [X86-64,SGX] Disables Intel SGX kernel support. + nosmap [PPC] + Disable SMAP (Supervisor Mode Access Prevention) + even if it is supported by processor. + + nosmep [PPC64s] + Disable SMEP (Supervisor Mode Execution Prevention) + even if it is supported by processor. + nosmp [SMP] Tells an SMP kernel to act as a UP kernel, and disable the IO APIC. legacy for "maxcpus=0". + nosmt [KNL,S390] Disable symmetric multithreading (SMT). + Equivalent to smt=1. + + [KNL,X86] Disable symmetric multithreading (SMT). + nosmt=force: Force disable SMT, cannot be undone + via the sysfs control file. + nosoftlockup [KNL] Disable the soft-lockup detector. + nospec_store_bypass_disable + [HW] Disable all mitigations for the Speculative Store Bypass vulnerability + + nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch + history injection) vulnerability. System may allow data leaks + with this option. + + nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1 + (bounds check bypass). With this option data leaks are + possible in the system. + + nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for + the Spectre variant 2 (indirect branch prediction) + vulnerability. System may allow data leaks with this + option. + + no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized + steal time accounting. steal time is computed, but + won't influence scheduler behaviour + nosync [HW,M68K] Disables sync negotiation for all devices. + no_timer_check [X86,APIC] Disables the code which tests for + broken timer IRQ sources. + + no_uaccess_flush + [PPC] Don't flush the L1-D cache after accessing user data. + + novmcoredd [KNL,KDUMP] + Disable device dump. Device dump allows drivers to + append dump data to vmcore so you can collect driver + specified debug info. Drivers can append the data + without any limit and this data is stored in memory, + so this may cause significant memory stress. Disabling + device dump can help save memory but the driver debug + data will be no longer available. This parameter + is only available when CONFIG_PROC_VMCORE_DEVICE_DUMP + is set. + + no-vmw-sched-clock + [X86,PV_OPS] Disable paravirtualized VMware scheduler + clock and use the default one. + nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). @@ -3875,6 +3856,25 @@ LEGACY_XAPIC_DISABLED bit set in the IA32_XAPIC_DISABLE_STATUS MSR. + noxsave [BUGS=X86] Disables x86 extended register state save + and restore using xsave. The kernel will fallback to + enabling legacy floating-point and sse state. + + noxsaveopt [X86] Disables xsaveopt used in saving x86 extended + register states. The kernel will fall back to use + xsave to save the states. By using this parameter, + performance of saving the states is degraded because + xsave doesn't support modified optimization while + xsaveopt supports it on xsaveopt enabled systems. + + noxsaves [X86] Disables xsaves and xrstors used in saving and + restoring x86 extended register state in compacted + form of xsave area. The kernel will fall back to use + xsaveopt and xrstor to save and restore the states + in standard form of xsave area. By using this + parameter, xsave area per process might occupy more + memory on xsaves enabled systems. + nps_mtm_hs_ctr= [KNL,ARC] This parameter sets the maximum duration, in cycles, each HW thread of the CTOP can run @@ -4410,7 +4410,7 @@ and performance comparison. pirq= [SMP,APIC] Manual mp-table setup - See Documentation/x86/i386/IO-APIC.rst. + See Documentation/arch/x86/i386/IO-APIC.rst. plip= [PPT,NET] Parallel port network link Format: { parport<nr> | timid | 0 } @@ -4582,9 +4582,6 @@ Not specifying this option is equivalent to pti=auto. - nopti [X86-64] - Equivalent to pti=off - pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in default number. @@ -5591,7 +5588,7 @@ serialnumber [BUGS=X86-32] - sev=option[,option...] [X86-64] See Documentation/x86/x86_64/boot-options.rst + sev=option[,option...] [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst shapers= [NET] Maximal number of shapers. @@ -6770,7 +6767,7 @@ Can be used multiple times for multiple devices. vga= [BOOT,X86-32] Select a particular video mode - See Documentation/x86/boot.rst and + See Documentation/arch/x86/boot.rst and Documentation/admin-guide/svga.rst. Use vga=ask for menu. This is actually a boot loader parameter; the value is @@ -6933,6 +6930,12 @@ When enabled, memory and cache locality will be impacted. + writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of + ioremap_wc(). + + on - Enable writecombine, use WUC for ioremap_wc() + off - Disable writecombine, use SUC for ioremap_wc() + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/Documentation/admin-guide/quickly-build-trimmed-linux.rst b/Documentation/admin-guide/quickly-build-trimmed-linux.rst new file mode 100644 index 000000000000..ff4f4cc8522b --- /dev/null +++ b/Documentation/admin-guide/quickly-build-trimmed-linux.rst @@ -0,0 +1,1092 @@ +.. SPDX-License-Identifier: (GPL-2.0+ OR CC-BY-4.0) +.. [see the bottom of this file for redistribution information] + +=========================================== +How to quickly build a trimmed Linux kernel +=========================================== + +This guide explains how to swiftly build Linux kernels that are ideal for +testing purposes, but perfectly fine for day-to-day use, too. + +The essence of the process (aka 'TL;DR') +======================================== + +*[If you are new to compiling Linux, ignore this TLDR and head over to the next +section below: it contains a step-by-step guide, which is more detailed, but +still brief and easy to follow; that guide and its accompanying reference +section also mention alternatives, pitfalls, and additional aspects, all of +which might be relevant for you.]* + +If your system uses techniques like Secure Boot, prepare it to permit starting +self-compiled Linux kernels; install compilers and everything else needed for +building Linux; make sure to have 12 Gigabyte free space in your home directory. +Now run the following commands to download fresh Linux mainline sources, which +you then use to configure, build and install your own kernel:: + + git clone --depth 1 -b master \ + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git ~/linux/ + cd ~/linux/ + # Hint: if you want to apply patches, do it at this point. See below for details. + # Hint: it's recommended to tag your build at this point. See below for details. + yes "" | make localmodconfig + # Hint: at this point you might want to adjust the build configuration; you'll + # have to, if you are running Debian. See below for details. + make -j $(nproc --all) + # Note: on many commodity distributions the next command suffices, but on Arch + # Linux, its derivatives, and some others it does not. See below for details. + command -v installkernel && sudo make modules_install install + reboot + +If you later want to build a newer mainline snapshot, use these commands:: + + cd ~/linux/ + git fetch --depth 1 origin + # Note: the next command will discard any changes you did to the code: + git checkout --force --detach origin/master + # Reminder: if you want to (re)apply patches, do it at this point. + # Reminder: you might want to add or modify a build tag at this point. + make olddefconfig + make -j $(nproc --all) + # Reminder: the next command on some distributions does not suffice. + command -v installkernel && sudo make modules_install install + reboot + +Step-by-step guide +================== + +Compiling your own Linux kernel is easy in principle. There are various ways to +do it. Which of them actually work and is the best depends on the circumstances. + +This guide describes a way perfectly suited for those who want to quickly +install Linux from sources without being bothered by complicated details; the +goal is to cover everything typically needed on mainstream Linux distributions +running on commodity PC or server hardware. + +The described approach is great for testing purposes, for example to try a +proposed fix or to check if a problem was already fixed in the latest codebase. +Nonetheless, kernels built this way are also totally fine for day-to-day use +while at the same time being easy to keep up to date. + +The following steps describe the important aspects of the process; a +comprehensive reference section later explains each of them in more detail. It +sometimes also describes alternative approaches, pitfalls, as well as errors +that might occur at a particular point -- and how to then get things rolling +again. + +.. + Note: if you see this note, you are reading the text's source file. You + might want to switch to a rendered version, as it makes it a lot easier to + quickly look something up in the reference section and afterwards jump back + to where you left off. Find a the latest rendered version here: + https://docs.kernel.org/admin-guide/quickly-build-trimmed-linux.html + +.. _backup_sbs: + + * Create a fresh backup and put system repair and restore tools at hand, just + to be prepared for the unlikely case of something going sideways. + + [:ref:`details<backup>`] + +.. _secureboot_sbs: + + * On platforms with 'Secure Boot' or similar techniques, prepare everything to + ensure the system will permit your self-compiled kernel to boot later. The + quickest and easiest way to achieve this on commodity x86 systems is to + disable such techniques in the BIOS setup utility; alternatively, remove + their restrictions through a process initiated by + ``mokutil --disable-validation``. + + [:ref:`details<secureboot>`] + +.. _buildrequires_sbs: + + * Install all software required to build a Linux kernel. Often you will need: + 'bc', 'binutils' ('ld' et al.), 'bison', 'flex', 'gcc', 'git', 'openssl', + 'pahole', 'perl', and the development headers for 'libelf' and 'openssl'. The + reference section shows how to quickly install those on various popular Linux + distributions. + + [:ref:`details<buildrequires>`] + +.. _diskspace_sbs: + + * Ensure to have enough free space for building and installing Linux. For the + latter 150 Megabyte in /lib/ and 100 in /boot/ are a safe bet. For storing + sources and build artifacts 12 Gigabyte in your home directory should + typically suffice. If you have less available, be sure to check the reference + section for the step that explains adjusting your kernels build + configuration: it mentions a trick that reduce the amount of required space + in /home/ to around 4 Gigabyte. + + [:ref:`details<diskspace>`] + +.. _sources_sbs: + + * Retrieve the sources of the Linux version you intend to build; then change + into the directory holding them, as all further commands in this guide are + meant to be executed from there. + + *[Note: the following paragraphs describe how to retrieve the sources by + partially cloning the Linux stable git repository. This is called a shallow + clone. The reference section explains two alternatives:* :ref:`packaged + archives<sources_archive>` *and* :ref:`a full git clone<sources_full>` *; + prefer the latter, if downloading a lot of data does not bother you, as that + will avoid some* :ref:`peculiar characteristics of shallow clones the + reference section explains<sources_shallow>` *.]* + + First, execute the following command to retrieve a fresh mainline codebase:: + + git clone --no-checkout --depth 1 -b master \ + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git ~/linux/ + cd ~/linux/ + + If you want to access recent mainline releases and pre-releases, deepen you + clone's history to the oldest mainline version you are interested in:: + + git fetch --shallow-exclude=v6.0 origin + + In case you want to access a stable/longterm release (say v6.1.5), simply add + the branch holding that series; afterwards fetch the history at least up to + the mainline version that started the series (v6.1):: + + git remote set-branches --add origin linux-6.1.y + git fetch --shallow-exclude=v6.0 origin + + Now checkout the code you are interested in. If you just performed the + initial clone, you will be able to check out a fresh mainline codebase, which + is ideal for checking whether developers already fixed an issue:: + + git checkout --detach origin/master + + If you deepened your clone, you instead of ``origin/master`` can specify the + version you deepened to (``v6.0`` above); later releases like ``v6.1`` and + pre-release like ``v6.2-rc1`` will work, too. Stable or longterm versions + like ``v6.1.5`` work just the same, if you added the appropriate + stable/longterm branch as described. + + [:ref:`details<sources>`] + +.. _patching_sbs: + + * In case you want to apply a kernel patch, do so now. Often a command like + this will do the trick:: + + patch -p1 < ../proposed-fix.patch + + If the ``-p1`` is actually needed, depends on how the patch was created; in + case it does not apply thus try without it. + + If you cloned the sources with git and anything goes sideways, run ``git + reset --hard`` to undo any changes to the sources. + + [:ref:`details<patching>`] + +.. _tagging_sbs: + + * If you patched your kernel or have one of the same version installed already, + better add a unique tag to the one you are about to build:: + + echo "-proposed_fix" > localversion + + Running ``uname -r`` under your kernel later will then print something like + '6.1-rc4-proposed_fix'. + + [:ref:`details<tagging>`] + + .. _configuration_sbs: + + * Create the build configuration for your kernel based on an existing + configuration. + + If you already prepared such a '.config' file yourself, copy it to + ~/linux/ and run ``make olddefconfig``. + + Use the same command, if your distribution or somebody else already tailored + your running kernel to your or your hardware's needs: the make target + 'olddefconfig' will then try to use that kernel's .config as base. + + Using this make target is fine for everybody else, too -- but you often can + save a lot of time by using this command instead:: + + yes "" | make localmodconfig + + This will try to pick your distribution's kernel as base, but then disable + modules for any features apparently superfluous for your setup. This will + reduce the compile time enormously, especially if you are running an + universal kernel from a commodity Linux distribution. + + There is a catch: the make target 'localmodconfig' will disable kernel + features you have not directly or indirectly through some program utilized + since you booted the system. You can reduce or nearly eliminate that risk by + using tricks outlined in the reference section; for quick testing purposes + that risk is often negligible, but it is an aspect you want to keep in mind + in case your kernel behaves oddly. + + [:ref:`details<configuration>`] + +.. _configmods_sbs: + + * Check if you might want to or have to adjust some kernel configuration + options: + + * Evaluate how you want to handle debug symbols. Enable them, if you later + might need to decode a stack trace found for example in a 'panic', 'Oops', + 'warning', or 'BUG'; on the other hand disable them, if you are short on + storage space or prefer a smaller kernel binary. See the reference section + for details on how to do either. If neither applies, it will likely be fine + to simply not bother with this. [:ref:`details<configmods_debugsymbols>`] + + * Are you running Debian? Then to avoid known problems by performing + additional adjustments explained in the reference section. + [:ref:`details<configmods_distros>`]. + + * If you want to influence the other aspects of the configuration, do so now + by using make targets like 'menuconfig' or 'xconfig'. + [:ref:`details<configmods_individual>`]. + +.. _build_sbs: + + * Build the image and the modules of your kernel:: + + make -j $(nproc --all) + + If you want your kernel packaged up as deb, rpm, or tar file, see the + reference section for alternatives. + + [:ref:`details<build>`] + +.. _install_sbs: + + * Now install your kernel:: + + command -v installkernel && sudo make modules_install install + + Often all left for you to do afterwards is a ``reboot``, as many commodity + Linux distributions will then create an initramfs (also known as initrd) and + an entry for your kernel in your bootloader's configuration; but on some + distributions you have to take care of these two steps manually for reasons + the reference section explains. + + On a few distributions like Arch Linux and its derivatives the above command + does nothing at all; in that case you have to manually install your kernel, + as outlined in the reference section. + + [:ref:`details<install>`] + +.. _another_sbs: + + * To later build another kernel you need similar steps, but sometimes slightly + different commands. + + First, switch back into the sources tree:: + + cd ~/linux/ + + In case you want to build a version from a stable/longterm series you have + not used yet (say 6.2.y), tell git to track it:: + + git remote set-branches --add origin linux-6.2.y + + Now fetch the latest upstream changes; you again need to specify the earliest + version you care about, as git otherwise might retrieve the entire commit + history:: + + git fetch --shallow-exclude=v6.1 origin + + If you modified the sources (for example by applying a patch), you now need + to discard those modifications; that's because git otherwise will not be able + to switch to the sources of another version due to potential conflicting + changes:: + + git reset --hard + + Now checkout the version you are interested in, as explained above:: + + git checkout --detach origin/master + + At this point you might want to patch the sources again or set/modify a build + tag, as explained earlier; afterwards adjust the build configuration to the + new codebase and build your next kernel:: + + # reminder: if you want to apply patches, do it at this point + # reminder: you might want to update your build tag at this point + make olddefconfig + make -j $(nproc --all) + + Install the kernel as outlined above:: + + command -v installkernel && sudo make modules_install install + + [:ref:`details<another>`] + +.. _uninstall_sbs: + + * Your kernel is easy to remove later, as its parts are only stored in two + places and clearly identifiable by the kernel's release name. Just ensure to + not delete the kernel you are running, as that might render your system + unbootable. + + Start by deleting the directory holding your kernel's modules, which is named + after its release name -- '6.0.1-foobar' in the following example:: + + sudo rm -rf /lib/modules/6.0.1-foobar + + Now try the following command, which on some distributions will delete all + other kernel files installed while also removing the kernel's entry from the + bootloader configuration:: + + command -v kernel-install && sudo kernel-install -v remove 6.0.1-foobar + + If that command does not output anything or fails, see the reference section; + do the same if any files named '*6.0.1-foobar*' remain in /boot/. + + [:ref:`details<uninstall>`] + +.. _submit_improvements: + +Did you run into trouble following any of the above steps that is not cleared up +by the reference section below? Or do you have ideas how to improve the text? +Then please take a moment of your time and let the maintainer of this document +know by email (Thorsten Leemhuis <linux@leemhuis.info>), ideally while CCing the +Linux docs mailing list (linux-doc@vger.kernel.org). Such feedback is vital to +improve this document further, which is in everybody's interest, as it will +enable more people to master the task described here. + +Reference section for the step-by-step guide +============================================ + +This section holds additional information for each of the steps in the above +guide. + +.. _backup: + +Prepare for emergencies +----------------------- + + *Create a fresh backup and put system repair and restore tools at hand* + [:ref:`... <backup_sbs>`] + +Remember, you are dealing with computers, which sometimes do unexpected things +-- especially if you fiddle with crucial parts like the kernel of an operating +system. That's what you are about to do in this process. Hence, better prepare +for something going sideways, even if that should not happen. + +[:ref:`back to step-by-step guide <backup_sbs>`] + +.. _secureboot: + +Dealing with techniques like Secure Boot +---------------------------------------- + + *On platforms with 'Secure Boot' or similar techniques, prepare everything to + ensure the system will permit your self-compiled kernel to boot later.* + [:ref:`... <secureboot_sbs>`] + +Many modern systems allow only certain operating systems to start; they thus by +default will reject booting self-compiled kernels. + +You ideally deal with this by making your platform trust your self-built kernels +with the help of a certificate and signing. How to do that is not described +here, as it requires various steps that would take the text too far away from +its purpose; 'Documentation/admin-guide/module-signing.rst' and various web +sides already explain this in more detail. + +Temporarily disabling solutions like Secure Boot is another way to make your own +Linux boot. On commodity x86 systems it is possible to do this in the BIOS Setup +utility; the steps to do so are not described here, as they greatly vary between +machines. + +On mainstream x86 Linux distributions there is a third and universal option: +disable all Secure Boot restrictions for your Linux environment. You can +initiate this process by running ``mokutil --disable-validation``; this will +tell you to create a one-time password, which is safe to write down. Now +restart; right after your BIOS performed all self-tests the bootloader Shim will +show a blue box with a message 'Press any key to perform MOK management'. Hit +some key before the countdown exposes. This will open a menu and choose 'Change +Secure Boot state' there. Shim's 'MokManager' will now ask you to enter three +randomly chosen characters from the one-time password specified earlier. Once +you provided them, confirm that you really want to disable the validation. +Afterwards, permit MokManager to reboot the machine. + +[:ref:`back to step-by-step guide <secureboot_sbs>`] + +.. _buildrequires: + +Install build requirements +-------------------------- + + *Install all software required to build a Linux kernel.* + [:ref:`...<buildrequires_sbs>`] + +The kernel is pretty stand-alone, but besides tools like the compiler you will +sometimes need a few libraries to build one. How to install everything needed +depends on your Linux distribution and the configuration of the kernel you are +about to build. + +Here are a few examples what you typically need on some mainstream +distributions: + + * Debian, Ubuntu, and derivatives:: + + sudo apt install bc binutils bison dwarves flex gcc git make openssl \ + pahole perl-base libssl-dev libelf-dev + + * Fedora and derivatives:: + + sudo dnf install binutils /usr/include/{libelf.h,openssl/pkcs7.h} \ + /usr/bin/{bc,bison,flex,gcc,git,openssl,make,perl,pahole} + + * openSUSE and derivatives:: + + sudo zypper install bc binutils bison dwarves flex gcc git make perl-base \ + openssl openssl-devel libelf-dev + +In case you wonder why these lists include openssl and its development headers: +they are needed for the Secure Boot support, which many distributions enable in +their kernel configuration for x86 machines. + +Sometimes you will need tools for compression formats like bzip2, gzip, lz4, +lzma, lzo, xz, or zstd as well. + +You might need additional libraries and their development headers in case you +perform tasks not covered in this guide. For example, zlib will be needed when +building kernel tools from the tools/ directory; adjusting the build +configuration with make targets like 'menuconfig' or 'xconfig' will require +development headers for ncurses or Qt5. + +[:ref:`back to step-by-step guide <buildrequires_sbs>`] + +.. _diskspace: + +Space requirements +------------------ + + *Ensure to have enough free space for building and installing Linux.* + [:ref:`... <diskspace_sbs>`] + +The numbers mentioned are rough estimates with a big extra charge to be on the +safe side, so often you will need less. + +If you have space constraints, remember to read the reference section when you +reach the :ref:`section about configuration adjustments' <configmods>`, as +ensuring debug symbols are disabled will reduce the consumed disk space by quite +a few gigabytes. + +[:ref:`back to step-by-step guide <diskspace_sbs>`] + + +.. _sources: + +Download the sources +-------------------- + + *Retrieve the sources of the Linux version you intend to build.* + [:ref:`...<sources_sbs>`] + +The step-by-step guide outlines how to retrieve Linux' sources using a shallow +git clone. There is :ref:`more to tell about this method<sources_shallow>` and +two alternate ways worth describing: :ref:`packaged archives<sources_archive>` +and :ref:`a full git clone<sources_full>`. And the aspects ':ref:`wouldn't it +be wiser to use a proper pre-release than the latest mainline code +<sources_snapshot>`' and ':ref:`how to get an even fresher mainline codebase +<sources_fresher>`' need elaboration, too. + +Note, to keep things simple the commands used in this guide store the build +artifacts in the source tree. If you prefer to separate them, simply add +something like ``O=~/linux-builddir/`` to all make calls; also adjust the path +in all commands that add files or modify any generated (like your '.config'). + +[:ref:`back to step-by-step guide <sources_sbs>`] + +.. _sources_shallow: + +Noteworthy characteristics of shallow clones +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The step-by-step guide uses a shallow clone, as it is the best solution for most +of this document's target audience. There are a few aspects of this approach +worth mentioning: + + * This document in most places uses ``git fetch`` with ``--shallow-exclude=`` + to specify the earliest version you care about (or to be precise: its git + tag). You alternatively can use the parameter ``--shallow-since=`` to specify + an absolute (say ``'2023-07-15'``) or relative (``'12 months'``) date to + define the depth of the history you want to download. As a second + alternative, you can also specify a certain depth explicitly with a parameter + like ``--depth=1``, unless you add branches for stable/longterm kernels. + + * When running ``git fetch``, remember to always specify the oldest version, + the time you care about, or an explicit depth as shown in the step-by-step + guide. Otherwise you will risk downloading nearly the entire git history, + which will consume quite a bit of time and bandwidth while also stressing the + servers. + + Note, you do not have to use the same version or date all the time. But when + you change it over time, git will deepen or flatten the history to the + specified point. That allows you to retrieve versions you initially thought + you did not need -- or it will discard the sources of older versions, for + example in case you want to free up some disk space. The latter will happen + automatically when using ``--shallow-since=`` or + ``--depth=``. + + * Be warned, when deepening your clone you might encounter an error like + 'fatal: error in object: unshallow cafecaca0c0dacafecaca0c0dacafecaca0c0da'. + In that case run ``git repack -d`` and try again`` + + * In case you want to revert changes from a certain version (say Linux 6.3) or + perform a bisection (v6.2..v6.3), better tell ``git fetch`` to retrieve + objects up to three versions earlier (e.g. 6.0): ``git describe`` will then + be able to describe most commits just like it would in a full git clone. + +[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`] + +.. _sources_archive: + +Downloading the sources using a packages archive +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +People new to compiling Linux often assume downloading an archive via the +front-page of https://kernel.org is the best approach to retrieve Linux' +sources. It actually can be, if you are certain to build just one particular +kernel version without changing any code. Thing is: you might be sure this will +be the case, but in practice it often will turn out to be a wrong assumption. + +That's because when reporting or debugging an issue developers will often ask to +give another version a try. They also might suggest temporarily undoing a commit +with ``git revert`` or might provide various patches to try. Sometimes reporters +will also be asked to use ``git bisect`` to find the change causing a problem. +These things rely on git or are a lot easier and quicker to handle with it. + +A shallow clone also does not add any significant overhead. For example, when +you use ``git clone --depth=1`` to create a shallow clone of the latest mainline +codebase git will only retrieve a little more data than downloading the latest +mainline pre-release (aka 'rc') via the front-page of kernel.org would. + +A shallow clone therefore is often the better choice. If you nevertheless want +to use a packaged source archive, download one via kernel.org; afterwards +extract its content to some directory and change to the subdirectory created +during extraction. The rest of the step-by-step guide will work just fine, apart +from things that rely on git -- but this mainly concerns the section on +successive builds of other versions. + +[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`] + +.. _sources_full: + +Downloading the sources using a full git clone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If downloading and storing a lot of data (~4,4 Gigabyte as of early 2023) is +nothing that bothers you, instead of a shallow clone perform a full git clone +instead. You then will avoid the specialties mentioned above and will have all +versions and individual commits at hand at any time:: + + curl -L \ + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/clone.bundle \ + -o linux-stable.git.bundle + git clone clone.bundle ~/linux/ + rm linux-stable.git.bundle + cd ~/linux/ + git remote set-url origin + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git + git fetch origin + git checkout --detach origin/master + +[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`] + +.. _sources_snapshot: + +Proper pre-releases (RCs) vs. latest mainline +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When cloning the sources using git and checking out origin/master, you often +will retrieve a codebase that is somewhere between the latest and the next +release or pre-release. This almost always is the code you want when giving +mainline a shot: pre-releases like v6.1-rc5 are in no way special, as they do +not get any significant extra testing before being published. + +There is one exception: you might want to stick to the latest mainline release +(say v6.1) before its successor's first pre-release (v6.2-rc1) is out. That is +because compiler errors and other problems are more likely to occur during this +time, as mainline then is in its 'merge window': a usually two week long phase, +in which the bulk of the changes for the next release is merged. + +[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`] + +.. _sources_fresher: + +Avoiding the mainline lag +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The explanations for both the shallow clone and the full clone both retrieve the +code from the Linux stable git repository. That makes things simpler for this +document's audience, as it allows easy access to both mainline and +stable/longterm releases. This approach has just one downside: + +Changes merged into the mainline repository are only synced to the master branch +of the Linux stable repository every few hours. This lag most of the time is +not something to worry about; but in case you really need the latest code, just +add the mainline repo as additional remote and checkout the code from there:: + + git remote add mainline \ + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git + git fetch mainline + git checkout --detach mainline/master + +When doing this with a shallow clone, remember to call ``git fetch`` with one +of the parameters described earlier to limit the depth. + +[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`] + +.. _patching: + +Patch the sources (optional) +---------------------------- + + *In case you want to apply a kernel patch, do so now.* + [:ref:`...<patching_sbs>`] + +This is the point where you might want to patch your kernel -- for example when +a developer proposed a fix and asked you to check if it helps. The step-by-step +guide already explains everything crucial here. + +[:ref:`back to step-by-step guide <patching_sbs>`] + +.. _tagging: + +Tagging this kernel build (optional, often wise) +------------------------------------------------ + + *If you patched your kernel or already have that kernel version installed, + better tag your kernel by extending its release name:* + [:ref:`...<tagging_sbs>`] + +Tagging your kernel will help avoid confusion later, especially when you patched +your kernel. Adding an individual tag will also ensure the kernel's image and +its modules are installed in parallel to any existing kernels. + +There are various ways to add such a tag. The step-by-step guide realizes one by +creating a 'localversion' file in your build directory from which the kernel +build scripts will automatically pick up the tag. You can later change that file +to use a different tag in subsequent builds or simply remove that file to dump +the tag. + +[:ref:`back to step-by-step guide <tagging_sbs>`] + +.. _configuration: + +Define the build configuration for your kernel +---------------------------------------------- + + *Create the build configuration for your kernel based on an existing + configuration.* [:ref:`... <configuration_sbs>`] + +There are various aspects for this steps that require a more careful +explanation: + +Pitfalls when using another configuration file as base +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Make targets like localmodconfig and olddefconfig share a few common snares you +want to be aware of: + + * These targets will reuse a kernel build configuration in your build directory + (e.g. '~/linux/.config'), if one exists. In case you want to start from + scratch you thus need to delete it. + + * The make targets try to find the configuration for your running kernel + automatically, but might choose poorly. A line like '# using defaults found + in /boot/config-6.0.7-250.fc36.x86_64' or 'using config: + '/boot/config-6.0.7-250.fc36.x86_64' tells you which file they picked. If + that is not the intended one, simply store it as '~/linux/.config' + before using these make targets. + + * Unexpected things might happen if you try to use a config file prepared for + one kernel (say v6.0) on an older generation (say v5.15). In that case you + might want to use a configuration as base which your distribution utilized + when they used that or an slightly older kernel version. + +Influencing the configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The make target olddefconfig and the ``yes "" |`` used when utilizing +localmodconfig will set any undefined build options to their default value. This +among others will disable many kernel features that were introduced after your +base kernel was released. + +If you want to set these configurations options manually, use ``oldconfig`` +instead of ``olddefconfig`` or omit the ``yes "" |`` when utilizing +localmodconfig. Then for each undefined configuration option you will be asked +how to proceed. In case you are unsure what to answer, simply hit 'enter' to +apply the default value. + +Big pitfall when using localmodconfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As explained briefly in the step-by-step guide already: with localmodconfig it +can easily happen that your self-built kernel will lack modules for tasks you +did not perform before utilizing this make target. That's because those tasks +require kernel modules that are normally autoloaded when you perform that task +for the first time; if you didn't perform that task at least once before using +localmodonfig, the latter will thus assume these modules are superfluous and +disable them. + +You can try to avoid this by performing typical tasks that often will autoload +additional kernel modules: start a VM, establish VPN connections, loop-mount a +CD/DVD ISO, mount network shares (CIFS, NFS, ...), and connect all external +devices (2FA keys, headsets, webcams, ...) as well as storage devices with file +systems you otherwise do not utilize (btrfs, ext4, FAT, NTFS, XFS, ...). But it +is hard to think of everything that might be needed -- even kernel developers +often forget one thing or another at this point. + +Do not let that risk bother you, especially when compiling a kernel only for +testing purposes: everything typically crucial will be there. And if you forget +something important you can turn on a missing feature later and quickly run the +commands to compile and install a better kernel. + +But if you plan to build and use self-built kernels regularly, you might want to +reduce the risk by recording which modules your system loads over the course of +a few weeks. You can automate this with `modprobed-db +<https://github.com/graysky2/modprobed-db>`_. Afterwards use ``LSMOD=<path>`` to +point localmodconfig to the list of modules modprobed-db noticed being used:: + + yes "" | make LSMOD="${HOME}"/.config/modprobed.db localmodconfig + +Remote building with localmodconfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you want to use localmodconfig to build a kernel for another machine, run +``lsmod > lsmod_foo-machine`` on it and transfer that file to your build host. +Now point the build scripts to the file like this: ``yes "" | make +LSMOD=~/lsmod_foo-machine localmodconfig``. Note, in this case +you likely want to copy a base kernel configuration from the other machine over +as well and place it as .config in your build directory. + +[:ref:`back to step-by-step guide <configuration_sbs>`] + +.. _configmods: + +Adjust build configuration +-------------------------- + + *Check if you might want to or have to adjust some kernel configuration + options:* + +Depending on your needs you at this point might want or have to adjust some +kernel configuration options. + +.. _configmods_debugsymbols: + +Debug symbols +~~~~~~~~~~~~~ + + *Evaluate how you want to handle debug symbols.* + [:ref:`...<configmods_sbs>`] + +Most users do not need to care about this, it's often fine to leave everything +as it is; but you should take a closer look at this, if you might need to decode +a stack trace or want to reduce space consumption. + +Having debug symbols available can be important when your kernel throws a +'panic', 'Oops', 'warning', or 'BUG' later when running, as then you will be +able to find the exact place where the problem occurred in the code. But +collecting and embedding the needed debug information takes time and consumes +quite a bit of space: in late 2022 the build artifacts for a typical x86 kernel +configured with localmodconfig consumed around 5 Gigabyte of space with debug +symbols, but less than 1 when they were disabled. The resulting kernel image and +the modules are bigger as well, which increases load times. + +Hence, if you want a small kernel and are unlikely to decode a stack trace +later, you might want to disable debug symbols to avoid above downsides:: + + ./scripts/config --file .config -d DEBUG_INFO \ + -d DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT -d DEBUG_INFO_DWARF4 \ + -d DEBUG_INFO_DWARF5 -e CONFIG_DEBUG_INFO_NONE + make olddefconfig + +You on the other hand definitely want to enable them, if there is a decent +chance that you need to decode a stack trace later (as explained by 'Decode +failure messages' in Documentation/admin-guide/tainted-kernels.rst in more +detail):: + + ./scripts/config --file .config -d DEBUG_INFO_NONE -e DEBUG_KERNEL + -e DEBUG_INFO -e DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT -e KALLSYMS -e KALLSYMS_ALL + make olddefconfig + +Note, many mainstream distributions enable debug symbols in their kernel +configurations -- make targets like localmodconfig and olddefconfig thus will +often pick that setting up. + +[:ref:`back to step-by-step guide <configmods_sbs>`] + +.. _configmods_distros: + +Distro specific adjustments +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *Are you running* [:ref:`... <configmods_sbs>`] + +The following sections help you to avoid build problems that are known to occur +when following this guide on a few commodity distributions. + +**Debian:** + + * Remove a stale reference to a certificate file that would cause your build to + fail:: + + ./scripts/config --file .config --set-str SYSTEM_TRUSTED_KEYS '' + + Alternatively, download the needed certificate and make that configuration + option point to it, as `the Debian handbook explains in more detail + <https://debian-handbook.info/browse/stable/sect.kernel-compilation.html>`_ + -- or generate your own, as explained in + Documentation/admin-guide/module-signing.rst. + +[:ref:`back to step-by-step guide <configmods_sbs>`] + +.. _configmods_individual: + +Individual adjustments +~~~~~~~~~~~~~~~~~~~~~~ + + *If you want to influence the other aspects of the configuration, do so + now* [:ref:`... <configmods_sbs>`] + +You at this point can use a command like ``make menuconfig`` to enable or +disable certain features using a text-based user interface; to use a graphical +configuration utilize, use the make target ``xconfig`` or ``gconfig`` instead. +All of them require development libraries from toolkits they are based on +(ncurses, Qt5, Gtk2); an error message will tell you if something required is +missing. + +[:ref:`back to step-by-step guide <configmods_sbs>`] + +.. _build: + +Build your kernel +----------------- + + *Build the image and the modules of your kernel* [:ref:`... <build_sbs>`] + +A lot can go wrong at this stage, but the instructions below will help you help +yourself. Another subsection explains how to directly package your kernel up as +deb, rpm or tar file. + +Dealing with build errors +~~~~~~~~~~~~~~~~~~~~~~~~~ + +When a build error occurs, it might be caused by some aspect of your machine's +setup that often can be fixed quickly; other times though the problem lies in +the code and can only be fixed by a developer. A close examination of the +failure messages coupled with some research on the internet will often tell you +which of the two it is. To perform such a investigation, restart the build +process like this:: + + make V=1 + +The ``V=1`` activates verbose output, which might be needed to see the actual +error. To make it easier to spot, this command also omits the ``-j $(nproc +--all)`` used earlier to utilize every CPU core in the system for the job -- but +this parallelism also results in some clutter when failures occur. + +After a few seconds the build process should run into the error again. Now try +to find the most crucial line describing the problem. Then search the internet +for the most important and non-generic section of that line (say 4 to 8 words); +avoid or remove anything that looks remotely system-specific, like your username +or local path names like ``/home/username/linux/``. First try your regular +internet search engine with that string, afterwards search Linux kernel mailing +lists via `lore.kernel.org/all/ <https://lore.kernel.org/all/>`_. + +This most of the time will find something that will explain what is wrong; quite +often one of the hits will provide a solution for your problem, too. If you +do not find anything that matches your problem, try again from a different angle +by modifying your search terms or using another line from the error messages. + +In the end, most trouble you are to run into has likely been encountered and +reported by others already. That includes issues where the cause is not your +system, but lies the code. If you run into one of those, you might thus find a +solution (e.g. a patch) or workaround for your problem, too. + +Package your kernel up +~~~~~~~~~~~~~~~~~~~~~~ + +The step-by-step guide uses the default make targets (e.g. 'bzImage' and +'modules' on x86) to build the image and the modules of your kernel, which later +steps of the guide then install. You instead can also directly build everything +and directly package it up by using one of the following targets: + + * ``make -j $(nproc --all) bindeb-pkg`` to generate a deb package + + * ``make -j $(nproc --all) binrpm-pkg`` to generate a rpm package + + * ``make -j $(nproc --all) tarbz2-pkg`` to generate a bz2 compressed tarball + +This is just a selection of available make targets for this purpose, see +``make help`` for others. You can also use these targets after running +``make -j $(nproc --all)``, as they will pick up everything already built. + +If you employ the targets to generate deb or rpm packages, ignore the +step-by-step guide's instructions on installing and removing your kernel; +instead install and remove the packages using the package utility for the format +(e.g. dpkg and rpm) or a package management utility build on top of them (apt, +aptitude, dnf/yum, zypper, ...). Be aware that the packages generated using +these two make targets are designed to work on various distributions utilizing +those formats, they thus will sometimes behave differently than your +distribution's kernel packages. + +[:ref:`back to step-by-step guide <build_sbs>`] + +.. _install: + +Install your kernel +------------------- + + *Now install your kernel* [:ref:`... <install_sbs>`] + +What you need to do after executing the command in the step-by-step guide +depends on the existence and the implementation of an ``installkernel`` +executable. Many commodity Linux distributions ship such a kernel installer in +``/sbin/`` that does everything needed, hence there is nothing left for you +except rebooting. But some distributions contain an installkernel that does +only part of the job -- and a few lack it completely and leave all the work to +you. + +If ``installkernel`` is found, the kernel's build system will delegate the +actual installation of your kernel's image and related files to this executable. +On almost all Linux distributions it will store the image as '/boot/vmlinuz- +<your kernel's release name>' and put a 'System.map-<your kernel's release +name>' alongside it. Your kernel will thus be installed in parallel to any +existing ones, unless you already have one with exactly the same release name. + +Installkernel on many distributions will afterwards generate an 'initramfs' +(often also called 'initrd'), which commodity distributions rely on for booting; +hence be sure to keep the order of the two make targets used in the step-by-step +guide, as things will go sideways if you install your kernel's image before its +modules. Often installkernel will then add your kernel to the bootloader +configuration, too. You have to take care of one or both of these tasks +yourself, if your distributions installkernel doesn't handle them. + +A few distributions like Arch Linux and its derivatives totally lack an +installkernel executable. On those just install the modules using the kernel's +build system and then install the image and the System.map file manually:: + + sudo make modules_install + sudo install -m 0600 $(make -s image_name) /boot/vmlinuz-$(make -s kernelrelease) + sudo install -m 0600 System.map /boot/System.map-$(make -s kernelrelease) + +If your distribution boots with the help of an initramfs, now generate one for +your kernel using the tools your distribution provides for this process. +Afterwards add your kernel to your bootloader configuration and reboot. + +[:ref:`back to step-by-step guide <install_sbs>`] + +.. _another: + +Another round later +------------------- + + *To later build another kernel you need similar, but sometimes slightly + different commands* [:ref:`... <another_sbs>`] + +The process to build later kernels is similar, but at some points slightly +different. You for example do not want to use 'localmodconfig' for succeeding +kernel builds, as you already created a trimmed down configuration you want to +use from now on. Hence instead just use ``oldconfig`` or ``olddefconfig`` to +adjust your build configurations to the needs of the kernel version you are +about to build. + +If you created a shallow-clone with git, remember what the :ref:`section that +explained the setup described in more detail <sources>`: you need to use a +slightly different ``git fetch`` command and when switching to another series +need to add an additional remote branch. + +[:ref:`back to step-by-step guide <another_sbs>`] + +.. _uninstall: + +Uninstall the kernel later +-------------------------- + + *All parts of your installed kernel are identifiable by its release name and + thus easy to remove later.* [:ref:`... <uninstall_sbs>`] + +Do not worry installing your kernel manually and thus bypassing your +distribution's packaging system will totally mess up your machine: all parts of +your kernel are easy to remove later, as files are stored in two places only and +normally identifiable by the kernel's release name. + +One of the two places is a directory in /lib/modules/, which holds the modules +for each installed kernel. This directory is named after the kernel's release +name; hence, to remove all modules for one of your kernels, simply remove its +modules directory in /lib/modules/. + +The other place is /boot/, where typically one to five files will be placed +during installation of a kernel. All of them usually contain the release name in +their file name, but how many files and their name depends somewhat on your +distribution's installkernel executable (:ref:`see above <install>`) and its +initramfs generator. On some distributions the ``kernel-install`` command +mentioned in the step-by-step guide will remove all of these files for you -- +and the entry for your kernel in the bootloader configuration at the same time, +too. On others you have to take care of these steps yourself. The following +command should interactively remove the two main files of a kernel with the +release name '6.0.1-foobar':: + + rm -i /boot/{System.map,vmlinuz}-6.0.1-foobar + +Now remove the belonging initramfs, which often will be called something like +``/boot/initramfs-6.0.1-foobar.img`` or ``/boot/initrd.img-6.0.1-foobar``. +Afterwards check for other files in /boot/ that have '6.0.1-foobar' in their +name and delete them as well. Now remove the kernel from your bootloader's +configuration. + +Note, be very careful with wildcards like '*' when deleting files or directories +for kernels manually: you might accidentally remove files of a 6.0.11 kernel +when all you want is to remove 6.0 or 6.0.1. + +[:ref:`back to step-by-step guide <uninstall_sbs>`] + +.. _faq: + +FAQ +=== + +Why does this 'how-to' not work on my system? +--------------------------------------------- + +As initially stated, this guide is 'designed to cover everything typically +needed [to build a kernel] on mainstream Linux distributions running on +commodity PC or server hardware'. The outlined approach despite this should work +on many other setups as well. But trying to cover every possible use-case in one +guide would defeat its purpose, as without such a focus you would need dozens or +hundreds of constructs along the lines of 'in case you are having <insert +machine or distro>, you at this point have to do <this and that> +<instead|additionally>'. Each of which would make the text longer, more +complicated, and harder to follow. + +That being said: this of course is a balancing act. Hence, if you think an +additional use-case is worth describing, suggest it to the maintainers of this +document, as :ref:`described above <submit_improvements>`. + + +.. + end-of-content +.. + This document is maintained by Thorsten Leemhuis <linux@leemhuis.info>. If + you spot a typo or small mistake, feel free to let him know directly and + he'll fix it. You are free to do the same in a mostly informal way if you + want to contribute changes to the text -- but for copyright reasons please CC + linux-doc@vger.kernel.org and 'sign-off' your contribution as + Documentation/process/submitting-patches.rst explains in the section 'Sign + your work - the Developer's Certificate of Origin'. +.. + This text is available under GPL-2.0+ or CC-BY-4.0, as stated at the top + of the file. If you want to distribute this text under CC-BY-4.0 only, + please use 'The Linux kernel development community' for author attribution + and link this as source: + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/Documentation/admin-guide/quickly-build-trimmed-linux.rst +.. + Note: Only the content of this RST file as found in the Linux kernel sources + is available under CC-BY-4.0, as versions of this text that were processed + (for example by the kernel's build system) might contain content taken from + files which use a more restrictive license. + diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index 7b481b2a368e..8e03751d126d 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -199,7 +199,7 @@ Architecture (MCA)\ [#f3]_. mode). .. [#f3] For more details about the Machine Check Architecture (MCA), - please read Documentation/x86/x86_64/machinecheck.rst at the Kernel tree. + please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree. EDAC - Error Detection And Correction ************************************* diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 4b7bfea28cd7..d85d90f5d000 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -95,7 +95,7 @@ is 0x15 and the full version number is 0x234, this file will contain the value 340 = 0x154. See the ``type_of_loader`` and ``ext_loader_type`` fields in -Documentation/x86/boot.rst for additional information. +Documentation/arch/x86/boot.rst for additional information. bootloader_version (x86 only) @@ -105,7 +105,7 @@ The complete bootloader version number. In the example above, this file will contain the value 564 = 0x234. See the ``type_of_loader`` and ``ext_loader_ver`` fields in -Documentation/x86/boot.rst for additional information. +Documentation/arch/x86/boot.rst for additional information. bpf_stats_enabled diff --git a/Documentation/admin-guide/unicode.rst b/Documentation/admin-guide/unicode.rst index 290fe83ebe82..cba7e5017d36 100644 --- a/Documentation/admin-guide/unicode.rst +++ b/Documentation/admin-guide/unicode.rst @@ -3,11 +3,10 @@ Unicode support Last update: 2005-01-17, version 1.4 -This file is maintained by H. Peter Anvin <unicode@lanana.org> as part -of the Linux Assigned Names And Numbers Authority (LANANA) project. -The current version can be found at: - - http://www.lanana.org/docs/unicode/admin-guide/unicode.rst +Note: The original version of this document, which was maintained at +lanana.org as part of the Linux Assigned Names And Numbers Authority +(LANANA) project, is no longer existent. So, this version in the +mainline Linux kernel is now the maintained main document. Introduction ------------ diff --git a/Documentation/arc/arc.rst b/Documentation/arch/arc/arc.rst index 6c4d978f3f4e..6c4d978f3f4e 100644 --- a/Documentation/arc/arc.rst +++ b/Documentation/arch/arc/arc.rst diff --git a/Documentation/arc/features.rst b/Documentation/arch/arc/features.rst index b793583d688a..b793583d688a 100644 --- a/Documentation/arc/features.rst +++ b/Documentation/arch/arc/features.rst diff --git a/Documentation/arc/index.rst b/Documentation/arch/arc/index.rst index 7b098d4a5e3e..7b098d4a5e3e 100644 --- a/Documentation/arc/index.rst +++ b/Documentation/arch/arc/index.rst diff --git a/Documentation/ia64/aliasing.rst b/Documentation/arch/ia64/aliasing.rst index 36a1e1d4842b..36a1e1d4842b 100644 --- a/Documentation/ia64/aliasing.rst +++ b/Documentation/arch/ia64/aliasing.rst diff --git a/Documentation/ia64/efirtc.rst b/Documentation/arch/ia64/efirtc.rst index fd8328408301..fd8328408301 100644 --- a/Documentation/ia64/efirtc.rst +++ b/Documentation/arch/ia64/efirtc.rst diff --git a/Documentation/ia64/err_inject.rst b/Documentation/arch/ia64/err_inject.rst index 900f71e93a29..900f71e93a29 100644 --- a/Documentation/ia64/err_inject.rst +++ b/Documentation/arch/ia64/err_inject.rst diff --git a/Documentation/ia64/features.rst b/Documentation/arch/ia64/features.rst index d7226fdcf5f8..d7226fdcf5f8 100644 --- a/Documentation/ia64/features.rst +++ b/Documentation/arch/ia64/features.rst diff --git a/Documentation/ia64/fsys.rst b/Documentation/arch/ia64/fsys.rst index a702d2cc94b6..a702d2cc94b6 100644 --- a/Documentation/ia64/fsys.rst +++ b/Documentation/arch/ia64/fsys.rst diff --git a/Documentation/ia64/ia64.rst b/Documentation/arch/ia64/ia64.rst index b725019a9492..b725019a9492 100644 --- a/Documentation/ia64/ia64.rst +++ b/Documentation/arch/ia64/ia64.rst diff --git a/Documentation/ia64/index.rst b/Documentation/arch/ia64/index.rst index 761f2154dfa2..761f2154dfa2 100644 --- a/Documentation/ia64/index.rst +++ b/Documentation/arch/ia64/index.rst diff --git a/Documentation/ia64/irq-redir.rst b/Documentation/arch/ia64/irq-redir.rst index 6bbbbe4f73ef..6bbbbe4f73ef 100644 --- a/Documentation/ia64/irq-redir.rst +++ b/Documentation/arch/ia64/irq-redir.rst diff --git a/Documentation/ia64/mca.rst b/Documentation/arch/ia64/mca.rst index 08270bba44a4..08270bba44a4 100644 --- a/Documentation/ia64/mca.rst +++ b/Documentation/arch/ia64/mca.rst diff --git a/Documentation/ia64/serial.rst b/Documentation/arch/ia64/serial.rst index 1de70c305a79..1de70c305a79 100644 --- a/Documentation/ia64/serial.rst +++ b/Documentation/arch/ia64/serial.rst diff --git a/Documentation/arch.rst b/Documentation/arch/index.rst index 41a66a8b38e4..80ee31016584 100644 --- a/Documentation/arch.rst +++ b/Documentation/arch/index.rst @@ -10,18 +10,18 @@ implementation. :maxdepth: 2 arc/index - arm/index - arm64/index + ../arm/index + ../arm64/index ia64/index - loongarch/index + ../loongarch/index m68k/index - mips/index + ../mips/index nios2/index openrisc/index parisc/index - powerpc/index - riscv/index - s390/index + ../powerpc/index + ../riscv/index + ../s390/index sh/index sparc/index x86/index diff --git a/Documentation/m68k/buddha-driver.rst b/Documentation/arch/m68k/buddha-driver.rst index 20e401413991..20e401413991 100644 --- a/Documentation/m68k/buddha-driver.rst +++ b/Documentation/arch/m68k/buddha-driver.rst diff --git a/Documentation/m68k/features.rst b/Documentation/arch/m68k/features.rst index 5107a2119472..5107a2119472 100644 --- a/Documentation/m68k/features.rst +++ b/Documentation/arch/m68k/features.rst diff --git a/Documentation/m68k/index.rst b/Documentation/arch/m68k/index.rst index 0f890dbb5fe2..0f890dbb5fe2 100644 --- a/Documentation/m68k/index.rst +++ b/Documentation/arch/m68k/index.rst diff --git a/Documentation/m68k/kernel-options.rst b/Documentation/arch/m68k/kernel-options.rst index 2008a20b4329..2008a20b4329 100644 --- a/Documentation/m68k/kernel-options.rst +++ b/Documentation/arch/m68k/kernel-options.rst diff --git a/Documentation/nios2/features.rst b/Documentation/arch/nios2/features.rst index 8449e63f69b2..8449e63f69b2 100644 --- a/Documentation/nios2/features.rst +++ b/Documentation/arch/nios2/features.rst diff --git a/Documentation/nios2/index.rst b/Documentation/arch/nios2/index.rst index 4468fe1a1037..4468fe1a1037 100644 --- a/Documentation/nios2/index.rst +++ b/Documentation/arch/nios2/index.rst diff --git a/Documentation/nios2/nios2.rst b/Documentation/arch/nios2/nios2.rst index 43da3f7cee76..43da3f7cee76 100644 --- a/Documentation/nios2/nios2.rst +++ b/Documentation/arch/nios2/nios2.rst diff --git a/Documentation/openrisc/features.rst b/Documentation/arch/openrisc/features.rst index 3f7c40d219f2..3f7c40d219f2 100644 --- a/Documentation/openrisc/features.rst +++ b/Documentation/arch/openrisc/features.rst diff --git a/Documentation/openrisc/index.rst b/Documentation/arch/openrisc/index.rst index 6879f998b87a..6879f998b87a 100644 --- a/Documentation/openrisc/index.rst +++ b/Documentation/arch/openrisc/index.rst diff --git a/Documentation/openrisc/openrisc_port.rst b/Documentation/arch/openrisc/openrisc_port.rst index 657ac4af7be6..657ac4af7be6 100644 --- a/Documentation/openrisc/openrisc_port.rst +++ b/Documentation/arch/openrisc/openrisc_port.rst diff --git a/Documentation/openrisc/todo.rst b/Documentation/arch/openrisc/todo.rst index 420b18b87eda..420b18b87eda 100644 --- a/Documentation/openrisc/todo.rst +++ b/Documentation/arch/openrisc/todo.rst diff --git a/Documentation/parisc/debugging.rst b/Documentation/arch/parisc/debugging.rst index de1b60402c5b..de1b60402c5b 100644 --- a/Documentation/parisc/debugging.rst +++ b/Documentation/arch/parisc/debugging.rst diff --git a/Documentation/parisc/features.rst b/Documentation/arch/parisc/features.rst index 501d7c450037..501d7c450037 100644 --- a/Documentation/parisc/features.rst +++ b/Documentation/arch/parisc/features.rst diff --git a/Documentation/parisc/index.rst b/Documentation/arch/parisc/index.rst index 240685751825..240685751825 100644 --- a/Documentation/parisc/index.rst +++ b/Documentation/arch/parisc/index.rst diff --git a/Documentation/parisc/registers.rst b/Documentation/arch/parisc/registers.rst index 59c8ecf3e856..59c8ecf3e856 100644 --- a/Documentation/parisc/registers.rst +++ b/Documentation/arch/parisc/registers.rst diff --git a/Documentation/sh/booting.rst b/Documentation/arch/sh/booting.rst index d851c49a01bf..d851c49a01bf 100644 --- a/Documentation/sh/booting.rst +++ b/Documentation/arch/sh/booting.rst diff --git a/Documentation/sh/features.rst b/Documentation/arch/sh/features.rst index f722af3b6c99..f722af3b6c99 100644 --- a/Documentation/sh/features.rst +++ b/Documentation/arch/sh/features.rst diff --git a/Documentation/sh/index.rst b/Documentation/arch/sh/index.rst index c64776738cf6..c64776738cf6 100644 --- a/Documentation/sh/index.rst +++ b/Documentation/arch/sh/index.rst diff --git a/Documentation/sh/new-machine.rst b/Documentation/arch/sh/new-machine.rst index e501c52b3b30..e501c52b3b30 100644 --- a/Documentation/sh/new-machine.rst +++ b/Documentation/arch/sh/new-machine.rst diff --git a/Documentation/sh/register-banks.rst b/Documentation/arch/sh/register-banks.rst index 2bef5c8fcbbc..2bef5c8fcbbc 100644 --- a/Documentation/sh/register-banks.rst +++ b/Documentation/arch/sh/register-banks.rst diff --git a/Documentation/sparc/adi.rst b/Documentation/arch/sparc/adi.rst index dbcd8b6e7bc3..dbcd8b6e7bc3 100644 --- a/Documentation/sparc/adi.rst +++ b/Documentation/arch/sparc/adi.rst diff --git a/Documentation/sparc/console.rst b/Documentation/arch/sparc/console.rst index 73132db83ece..73132db83ece 100644 --- a/Documentation/sparc/console.rst +++ b/Documentation/arch/sparc/console.rst diff --git a/Documentation/sparc/features.rst b/Documentation/arch/sparc/features.rst index c0c92468b0fe..c0c92468b0fe 100644 --- a/Documentation/sparc/features.rst +++ b/Documentation/arch/sparc/features.rst diff --git a/Documentation/sparc/index.rst b/Documentation/arch/sparc/index.rst index ae884224eec2..ae884224eec2 100644 --- a/Documentation/sparc/index.rst +++ b/Documentation/arch/sparc/index.rst diff --git a/Documentation/sparc/oradax/dax-hv-api.txt b/Documentation/arch/sparc/oradax/dax-hv-api.txt index 7ecd0bf4957b..7ecd0bf4957b 100644 --- a/Documentation/sparc/oradax/dax-hv-api.txt +++ b/Documentation/arch/sparc/oradax/dax-hv-api.txt diff --git a/Documentation/sparc/oradax/oracle-dax.rst b/Documentation/arch/sparc/oradax/oracle-dax.rst index d1e14d572918..d1e14d572918 100644 --- a/Documentation/sparc/oradax/oracle-dax.rst +++ b/Documentation/arch/sparc/oradax/oracle-dax.rst diff --git a/Documentation/x86/amd-memory-encryption.rst b/Documentation/arch/x86/amd-memory-encryption.rst index 934310ce7258..934310ce7258 100644 --- a/Documentation/x86/amd-memory-encryption.rst +++ b/Documentation/arch/x86/amd-memory-encryption.rst diff --git a/Documentation/x86/amd_hsmp.rst b/Documentation/arch/x86/amd_hsmp.rst index 440e4b645a1c..440e4b645a1c 100644 --- a/Documentation/x86/amd_hsmp.rst +++ b/Documentation/arch/x86/amd_hsmp.rst diff --git a/Documentation/x86/boot.rst b/Documentation/arch/x86/boot.rst index 240d084782a6..33520ecdb37a 100644 --- a/Documentation/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -1344,7 +1344,7 @@ follow:: In addition to read/modify/write the setup header of the struct boot_params as that of 16-bit boot protocol, the boot loader should also fill the additional fields of the struct boot_params as -described in chapter Documentation/x86/zero-page.rst. +described in chapter Documentation/arch/x86/zero-page.rst. After setting up the struct boot_params, the boot loader can load the 32/64-bit kernel in the same way as that of 16-bit boot protocol. @@ -1380,7 +1380,7 @@ can be calculated as follows:: In addition to read/modify/write the setup header of the struct boot_params as that of 16-bit boot protocol, the boot loader should also fill the additional fields of the struct boot_params as described -in chapter Documentation/x86/zero-page.rst. +in chapter Documentation/arch/x86/zero-page.rst. After setting up the struct boot_params, the boot loader can load 64-bit kernel in the same way as that of 16-bit boot protocol, but diff --git a/Documentation/x86/booting-dt.rst b/Documentation/arch/x86/booting-dt.rst index 965a374071ab..b089ffd56e6e 100644 --- a/Documentation/x86/booting-dt.rst +++ b/Documentation/arch/x86/booting-dt.rst @@ -7,7 +7,7 @@ DeviceTree Booting the decompressor (the real mode entry point goes to the same 32bit entry point once it switched into protected mode). That entry point supports one calling convention which is documented in - Documentation/x86/boot.rst + Documentation/arch/x86/boot.rst The physical pointer to the device-tree block is passed via setup_data which requires at least boot protocol 2.09. The type filed is defined as diff --git a/Documentation/x86/buslock.rst b/Documentation/arch/x86/buslock.rst index 7c051e714943..31ec0ef78086 100644 --- a/Documentation/x86/buslock.rst +++ b/Documentation/arch/x86/buslock.rst @@ -53,8 +53,14 @@ parameter "split_lock_detect". Here is a summary of different options: |off |Do nothing |Do nothing | +------------------+----------------------------+-----------------------+ |warn |Kernel OOPs |Warn once per task and | -|(default) |Warn once per task and |and continues to run. | -| |disable future checking | | +|(default) |Warn once per task, add a |and continues to run. | +| |delay, add synchronization | | +| |to prevent more than one | | +| |core from executing a | | +| |split lock in parallel. | | +| |sysctl split_lock_mitigate | | +| |can be used to avoid the | | +| |delay and synchronization | | | |When both features are | | | |supported, warn in #AC | | +------------------+----------------------------+-----------------------+ diff --git a/Documentation/x86/cpuinfo.rst b/Documentation/arch/x86/cpuinfo.rst index 08246e8ac835..08246e8ac835 100644 --- a/Documentation/x86/cpuinfo.rst +++ b/Documentation/arch/x86/cpuinfo.rst diff --git a/Documentation/x86/earlyprintk.rst b/Documentation/arch/x86/earlyprintk.rst index 51ef11e8f725..51ef11e8f725 100644 --- a/Documentation/x86/earlyprintk.rst +++ b/Documentation/arch/x86/earlyprintk.rst diff --git a/Documentation/x86/elf_auxvec.rst b/Documentation/arch/x86/elf_auxvec.rst index 18e4744717f9..18e4744717f9 100644 --- a/Documentation/x86/elf_auxvec.rst +++ b/Documentation/arch/x86/elf_auxvec.rst diff --git a/Documentation/x86/entry_64.rst b/Documentation/arch/x86/entry_64.rst index 0afdce3c06f4..0afdce3c06f4 100644 --- a/Documentation/x86/entry_64.rst +++ b/Documentation/arch/x86/entry_64.rst diff --git a/Documentation/x86/exception-tables.rst b/Documentation/arch/x86/exception-tables.rst index efde1fef4fbd..efde1fef4fbd 100644 --- a/Documentation/x86/exception-tables.rst +++ b/Documentation/arch/x86/exception-tables.rst diff --git a/Documentation/x86/features.rst b/Documentation/arch/x86/features.rst index b663f15053ce..b663f15053ce 100644 --- a/Documentation/x86/features.rst +++ b/Documentation/arch/x86/features.rst diff --git a/Documentation/x86/i386/IO-APIC.rst b/Documentation/arch/x86/i386/IO-APIC.rst index ce4d8df15e7c..ce4d8df15e7c 100644 --- a/Documentation/x86/i386/IO-APIC.rst +++ b/Documentation/arch/x86/i386/IO-APIC.rst diff --git a/Documentation/x86/i386/index.rst b/Documentation/arch/x86/i386/index.rst index 8747cf5bbd49..8747cf5bbd49 100644 --- a/Documentation/x86/i386/index.rst +++ b/Documentation/arch/x86/i386/index.rst diff --git a/Documentation/x86/ifs.rst b/Documentation/arch/x86/ifs.rst index 97abb696a680..97abb696a680 100644 --- a/Documentation/x86/ifs.rst +++ b/Documentation/arch/x86/ifs.rst diff --git a/Documentation/x86/index.rst b/Documentation/arch/x86/index.rst index c73d133fd37c..c73d133fd37c 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/arch/x86/index.rst diff --git a/Documentation/x86/intel-hfi.rst b/Documentation/arch/x86/intel-hfi.rst index 49dea58ea4fb..49dea58ea4fb 100644 --- a/Documentation/x86/intel-hfi.rst +++ b/Documentation/arch/x86/intel-hfi.rst diff --git a/Documentation/x86/intel_txt.rst b/Documentation/arch/x86/intel_txt.rst index d83c1a2122c9..d83c1a2122c9 100644 --- a/Documentation/x86/intel_txt.rst +++ b/Documentation/arch/x86/intel_txt.rst diff --git a/Documentation/x86/iommu.rst b/Documentation/arch/x86/iommu.rst index 42c7a6faa39a..42c7a6faa39a 100644 --- a/Documentation/x86/iommu.rst +++ b/Documentation/arch/x86/iommu.rst diff --git a/Documentation/x86/kernel-stacks.rst b/Documentation/arch/x86/kernel-stacks.rst index 6b0bcf027ff1..6b0bcf027ff1 100644 --- a/Documentation/x86/kernel-stacks.rst +++ b/Documentation/arch/x86/kernel-stacks.rst diff --git a/Documentation/x86/mds.rst b/Documentation/arch/x86/mds.rst index 5d4330be200f..5d4330be200f 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/arch/x86/mds.rst diff --git a/Documentation/x86/microcode.rst b/Documentation/arch/x86/microcode.rst index b627c6f36bcf..b627c6f36bcf 100644 --- a/Documentation/x86/microcode.rst +++ b/Documentation/arch/x86/microcode.rst diff --git a/Documentation/x86/mtrr.rst b/Documentation/arch/x86/mtrr.rst index 9f0b1851771a..f65ef034da7a 100644 --- a/Documentation/x86/mtrr.rst +++ b/Documentation/arch/x86/mtrr.rst @@ -28,7 +28,7 @@ are aligned with platform MTRR setup. If MTRRs are only set up by the platform firmware code though and the OS does not make any specific MTRR mapping requests mtrr_type_lookup() should always return MTRR_TYPE_INVALID. -For details refer to Documentation/x86/pat.rst. +For details refer to Documentation/arch/x86/pat.rst. .. tip:: On Intel P6 family processors (Pentium Pro, Pentium II and later) diff --git a/Documentation/x86/orc-unwinder.rst b/Documentation/arch/x86/orc-unwinder.rst index cdb257015bd9..cdb257015bd9 100644 --- a/Documentation/x86/orc-unwinder.rst +++ b/Documentation/arch/x86/orc-unwinder.rst diff --git a/Documentation/x86/pat.rst b/Documentation/arch/x86/pat.rst index 5d901771016d..5d901771016d 100644 --- a/Documentation/x86/pat.rst +++ b/Documentation/arch/x86/pat.rst diff --git a/Documentation/x86/pti.rst b/Documentation/arch/x86/pti.rst index 4b858a9bad8d..4b858a9bad8d 100644 --- a/Documentation/x86/pti.rst +++ b/Documentation/arch/x86/pti.rst diff --git a/Documentation/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index 387ccbcb558f..387ccbcb558f 100644 --- a/Documentation/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst diff --git a/Documentation/x86/sgx.rst b/Documentation/arch/x86/sgx.rst index 2bcbffacbed5..2bcbffacbed5 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/arch/x86/sgx.rst diff --git a/Documentation/x86/sva.rst b/Documentation/arch/x86/sva.rst index 2e9b8b0f9a0f..2e9b8b0f9a0f 100644 --- a/Documentation/x86/sva.rst +++ b/Documentation/arch/x86/sva.rst diff --git a/Documentation/x86/tdx.rst b/Documentation/arch/x86/tdx.rst index dc8d9fd2c3f7..dc8d9fd2c3f7 100644 --- a/Documentation/x86/tdx.rst +++ b/Documentation/arch/x86/tdx.rst diff --git a/Documentation/x86/tlb.rst b/Documentation/arch/x86/tlb.rst index 82ec58ae63a8..82ec58ae63a8 100644 --- a/Documentation/x86/tlb.rst +++ b/Documentation/arch/x86/tlb.rst diff --git a/Documentation/x86/topology.rst b/Documentation/arch/x86/topology.rst index 7f58010ea86a..7f58010ea86a 100644 --- a/Documentation/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/arch/x86/tsx_async_abort.rst index 583ddc185ba2..583ddc185ba2 100644 --- a/Documentation/x86/tsx_async_abort.rst +++ b/Documentation/arch/x86/tsx_async_abort.rst diff --git a/Documentation/x86/usb-legacy-support.rst b/Documentation/arch/x86/usb-legacy-support.rst index e01c08b7c981..e01c08b7c981 100644 --- a/Documentation/x86/usb-legacy-support.rst +++ b/Documentation/arch/x86/usb-legacy-support.rst diff --git a/Documentation/x86/x86_64/5level-paging.rst b/Documentation/arch/x86/x86_64/5level-paging.rst index b792bbdc0b01..71f882f4a173 100644 --- a/Documentation/x86/x86_64/5level-paging.rst +++ b/Documentation/arch/x86/x86_64/5level-paging.rst @@ -20,7 +20,7 @@ physical address space. This "ought to be enough for anybody" ©. QEMU 2.9 and later support 5-level paging. Virtual memory layout for 5-level paging is described in -Documentation/x86/x86_64/mm.rst +Documentation/arch/x86/x86_64/mm.rst Enabling 5-level paging diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/arch/x86/x86_64/boot-options.rst index cbd14124a667..137432d34109 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/arch/x86/x86_64/boot-options.rst @@ -9,7 +9,7 @@ only the AMD64 specific ones are listed here. Machine check ============= -Please see Documentation/x86/x86_64/machinecheck.rst for sysfs runtime tunables. +Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables. mce=off Disable machine check @@ -82,7 +82,7 @@ APICs Don't use the local APIC (alias for i386 compatibility) pirq=... - See Documentation/x86/i386/IO-APIC.rst + See Documentation/arch/x86/i386/IO-APIC.rst noapictimer Don't set up the APIC timer diff --git a/Documentation/x86/x86_64/cpu-hotplug-spec.rst b/Documentation/arch/x86/x86_64/cpu-hotplug-spec.rst index 8d1c91f0c880..8d1c91f0c880 100644 --- a/Documentation/x86/x86_64/cpu-hotplug-spec.rst +++ b/Documentation/arch/x86/x86_64/cpu-hotplug-spec.rst diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst index ff9bcfd2cc14..ba74617d4999 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst @@ -18,7 +18,7 @@ For more information on the features of cpusets, see Documentation/admin-guide/cgroup-v1/cpusets.rst. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of -configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst. +configuring fake nodes, see Documentation/arch/x86/x86_64/boot-options.rst. For the purposes of this introduction, we'll assume a very primitive NUMA emulation setup of "numa=fake=4*512,". This will split our system memory into diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/arch/x86/x86_64/fsgs.rst index 50960e09e1f6..50960e09e1f6 100644 --- a/Documentation/x86/x86_64/fsgs.rst +++ b/Documentation/arch/x86/x86_64/fsgs.rst diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst index a56070fc8e77..a56070fc8e77 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/arch/x86/x86_64/index.rst diff --git a/Documentation/x86/x86_64/machinecheck.rst b/Documentation/arch/x86/x86_64/machinecheck.rst index cea12ee97200..cea12ee97200 100644 --- a/Documentation/x86/x86_64/machinecheck.rst +++ b/Documentation/arch/x86/x86_64/machinecheck.rst diff --git a/Documentation/x86/x86_64/mm.rst b/Documentation/arch/x86/x86_64/mm.rst index 35e5e18c83d0..35e5e18c83d0 100644 --- a/Documentation/x86/x86_64/mm.rst +++ b/Documentation/arch/x86/x86_64/mm.rst diff --git a/Documentation/x86/x86_64/uefi.rst b/Documentation/arch/x86/x86_64/uefi.rst index fbc30c9a071d..fbc30c9a071d 100644 --- a/Documentation/x86/x86_64/uefi.rst +++ b/Documentation/arch/x86/x86_64/uefi.rst diff --git a/Documentation/x86/xstate.rst b/Documentation/arch/x86/xstate.rst index 5cec7fb558d6..5cec7fb558d6 100644 --- a/Documentation/x86/xstate.rst +++ b/Documentation/arch/x86/xstate.rst diff --git a/Documentation/x86/zero-page.rst b/Documentation/arch/x86/zero-page.rst index 45aa9cceb4f1..45aa9cceb4f1 100644 --- a/Documentation/x86/zero-page.rst +++ b/Documentation/arch/x86/zero-page.rst diff --git a/Documentation/xtensa/atomctl.rst b/Documentation/arch/xtensa/atomctl.rst index 1ecbd0ba9a2e..1ecbd0ba9a2e 100644 --- a/Documentation/xtensa/atomctl.rst +++ b/Documentation/arch/xtensa/atomctl.rst diff --git a/Documentation/xtensa/booting.rst b/Documentation/arch/xtensa/booting.rst index e1b83707e5b6..e1b83707e5b6 100644 --- a/Documentation/xtensa/booting.rst +++ b/Documentation/arch/xtensa/booting.rst diff --git a/Documentation/xtensa/features.rst b/Documentation/arch/xtensa/features.rst index 6b92c7bfa19d..6b92c7bfa19d 100644 --- a/Documentation/xtensa/features.rst +++ b/Documentation/arch/xtensa/features.rst diff --git a/Documentation/xtensa/index.rst b/Documentation/arch/xtensa/index.rst index 69952446a9be..69952446a9be 100644 --- a/Documentation/xtensa/index.rst +++ b/Documentation/arch/xtensa/index.rst diff --git a/Documentation/xtensa/mmu.rst b/Documentation/arch/xtensa/mmu.rst index 450573afa31a..450573afa31a 100644 --- a/Documentation/xtensa/mmu.rst +++ b/Documentation/arch/xtensa/mmu.rst diff --git a/Documentation/arm/index.rst b/Documentation/arm/index.rst index ae42fe886f0d..8890ec4314c2 100644 --- a/Documentation/arm/index.rst +++ b/Documentation/arm/index.rst @@ -69,11 +69,9 @@ SoC-specific documents spear/overview - sti/stih416-overview sti/stih407-overview sti/stih418-overview sti/overview - sti/stih415-overview vfp/release-notes diff --git a/Documentation/arm/sti/overview.rst b/Documentation/arm/sti/overview.rst index 70743617a74f..ae16aced800f 100644 --- a/Documentation/arm/sti/overview.rst +++ b/Documentation/arm/sti/overview.rst @@ -7,22 +7,18 @@ Introduction The ST Microelectronics Multimedia and Application Processors range of CortexA9 System-on-Chip are supported by the 'STi' platform of - ARM Linux. Currently STiH415, STiH416 SOCs are supported with both - B2000 and B2020 Reference boards. + ARM Linux. Currently STiH407, STiH410 and STiH418 are supported. configuration ------------- - A generic configuration is provided for both STiH415/416, and can be used as the - default by:: - - make stih41x_defconfig + The configuration for the STi platform is supported via the multi_v7_defconfig. Layout ------ - All the files for multiple machine families (STiH415, STiH416, and STiG125) + All the files for multiple machine families (STiH407, STiH410, and STiH418) are located in the platform code contained in arch/arm/mach-sti There is a generic board board-dt.c in the mach folder which support diff --git a/Documentation/arm/sti/stih415-overview.rst b/Documentation/arm/sti/stih415-overview.rst deleted file mode 100644 index b67452d610c4..000000000000 --- a/Documentation/arm/sti/stih415-overview.rst +++ /dev/null @@ -1,14 +0,0 @@ -================ -STiH415 Overview -================ - -Introduction ------------- - - The STiH415 is the next generation of HD, AVC set-top box processors - for satellite, cable, terrestrial and IP-STB markets. - - Features: - - - ARM Cortex-A9 1.0 GHz, dual-core CPU - - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2 diff --git a/Documentation/arm/sti/stih416-overview.rst b/Documentation/arm/sti/stih416-overview.rst deleted file mode 100644 index 93f17d74d8db..000000000000 --- a/Documentation/arm/sti/stih416-overview.rst +++ /dev/null @@ -1,13 +0,0 @@ -================ -STiH416 Overview -================ - -Introduction ------------- - - The STiH416 is the next generation of HD, AVC set-top box processors - for satellite, cable, terrestrial and IP-STB markets. - - Features - - ARM Cortex-A9 1.2 GHz dual core CPU - - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2 diff --git a/Documentation/conf.py b/Documentation/conf.py index db16814f182f..37314afd1ac8 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -343,9 +343,10 @@ sys.stderr.write("Using %s theme\n" % html_theme) # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['sphinx-static'] -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -html_use_smartypants = False +# If true, Docutils "smart quotes" will be used to convert quotes and dashes +# to typographically correct entities. This will convert "--" to "—", +# which is not always what we want, so disable it. +smartquotes = False # Custom sidebar templates, maps document names to template names. # Note that the RTD theme ignores this diff --git a/Documentation/core-api/asm-annotations.rst b/Documentation/core-api/asm-annotations.rst index bc514ed59887..11c96d3f9ad6 100644 --- a/Documentation/core-api/asm-annotations.rst +++ b/Documentation/core-api/asm-annotations.rst @@ -44,7 +44,7 @@ information. In particular, on properly annotated objects, ``objtool`` can be run to check and fix the object if needed. Currently, ``objtool`` can report missing frame pointer setup/destruction in functions. It can also automatically generate annotations for the ORC unwinder -(Documentation/x86/orc-unwinder.rst) +(Documentation/arch/x86/orc-unwinder.rst) for most code. Both of these are especially important to support reliable stack traces which are in turn necessary for kernel live patching (Documentation/livepatch/livepatch.rst). diff --git a/Documentation/core-api/dma-api-howto.rst b/Documentation/core-api/dma-api-howto.rst index 828846804e25..72f6cdb6be1c 100644 --- a/Documentation/core-api/dma-api-howto.rst +++ b/Documentation/core-api/dma-api-howto.rst @@ -185,7 +185,7 @@ device struct of your device is embedded in the bus-specific device struct of your device. For example, &pdev->dev is a pointer to the device struct of a PCI device (pdev is a pointer to the PCI device struct of your device). -These calls usually return zero to indicated your device can perform DMA +These calls usually return zero to indicate your device can perform DMA properly on the machine given the address mask you provided, but they might return an error if the mask is too small to be supportable on the given system. If it returns non-zero, your device cannot perform DMA properly on diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index 5483fd39ef29..2cb00b53339f 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -227,7 +227,7 @@ Testing with kmemleak-test -------------------------- To check if you have all set up to use kmemleak, you can use the kmemleak-test -module, a module that deliberately leaks memory. Set CONFIG_DEBUG_KMEMLEAK_TEST +module, a module that deliberately leaks memory. Set CONFIG_SAMPLE_KMEMLEAK as module (it can't be used as built-in) and boot the kernel with kmemleak enabled. Load the module and perform a scan with:: diff --git a/Documentation/devicetree/bindings/interrupt-controller/loongarch,cpu-interrupt-controller.yaml b/Documentation/devicetree/bindings/interrupt-controller/loongson,cpu-interrupt-controller.yaml index 2a1cf885c99d..adf989976dcc 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/loongarch,cpu-interrupt-controller.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/loongson,cpu-interrupt-controller.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause %YAML 1.2 --- -$id: http://devicetree.org/schemas/interrupt-controller/loongarch,cpu-interrupt-controller.yaml# +$id: http://devicetree.org/schemas/interrupt-controller/loongson,cpu-interrupt-controller.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: LoongArch CPU Interrupt Controller @@ -11,7 +11,7 @@ maintainers: properties: compatible: - const: loongarch,cpu-interrupt-controller + const: loongson,cpu-interrupt-controller '#interrupt-cells': const: 1 @@ -28,7 +28,7 @@ required: examples: - | interrupt-controller { - compatible = "loongarch,cpu-interrupt-controller"; + compatible = "loongson,cpu-interrupt-controller"; #interrupt-cells = <1>; interrupt-controller; }; diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst index 3cad45d14187..93bab5336dfd 100644 --- a/Documentation/driver-api/clk.rst +++ b/Documentation/driver-api/clk.rst @@ -258,6 +258,11 @@ clocks properly but rely on them being on from the bootloader, bypassing the disabling means that the driver will remain functional while the issues are sorted out. +You can see which clocks have been disabled by booting your kernel with these +parameters:: + + tp_printk trace_event=clk:clk_disable + To bypass this disabling, include "clk_ignore_unused" in the bootargs to the kernel. diff --git a/Documentation/driver-api/device-io.rst b/Documentation/driver-api/device-io.rst index 4d2baac0311c..2c7abd234f4e 100644 --- a/Documentation/driver-api/device-io.rst +++ b/Documentation/driver-api/device-io.rst @@ -410,7 +410,7 @@ ioremap_uc() ioremap_uc() behaves like ioremap() except that on the x86 architecture without 'PAT' mode, it marks memory as uncached even when the MTRR has designated -it as cacheable, see Documentation/x86/pat.rst. +it as cacheable, see Documentation/arch/x86/pat.rst. Portable drivers should avoid the use of ioremap_uc(). diff --git a/Documentation/driver-api/firmware/fw_search_path.rst b/Documentation/driver-api/firmware/fw_search_path.rst index a360f1009fa3..d7cb1e8f0076 100644 --- a/Documentation/driver-api/firmware/fw_search_path.rst +++ b/Documentation/driver-api/firmware/fw_search_path.rst @@ -22,5 +22,10 @@ can use the file: * /sys/module/firmware_class/parameters/path -You would echo into it your custom path and firmware requested will be -searched for there first. +You would echo into it your custom path and firmware requested will be searched +for there first. Be aware that newline characters will be taken into account +and may not produce the intended effects. For instance you might want to use: + +echo -n /path/to/script > /sys/module/firmware_class/parameters/path + +to ensure that your script is being used. diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index a43aacf1494e..4654ee57c1d5 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -40,8 +40,8 @@ Here are the main features of EROFS: - Support multiple devices to refer to external blobs, which can be used for container images; - - 4KiB block size and 32-bit block addresses for each device, therefore - 16TiB address space at most for now; + - 32-bit block addresses for each device, therefore 16TiB address space at + most with 4KiB block size for now; - Two inode layouts for different requirements: diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst index b9b31066aef2..ad6d21640576 100644 --- a/Documentation/filesystems/idmappings.rst +++ b/Documentation/filesystems/idmappings.rst @@ -241,7 +241,7 @@ according to the filesystem's idmapping as this would give the wrong owner if the caller is using an idmapping. So the kernel will map the id back up in the idmapping of the caller. Let's -assume the caller has the slighly unconventional idmapping +assume the caller has the somewhat unconventional idmapping ``u3000:k20000:r10000`` then ``k21000`` would map back up to ``u4000``. Consequently the user would see that this file is owned by ``u4000``. @@ -320,6 +320,10 @@ and equally wrong:: from_kuid(u20000:k0:r10000, u1000) = k21000 ~~~~~ +Since userspace ids have type ``uid_t`` and ``gid_t`` and kernel ids have type +``kuid_t`` and ``kgid_t`` the compiler will throw an error when they are +conflated. So the two examples above would cause a compilation failure. + Idmappings when creating filesystem objects ------------------------------------------- @@ -623,42 +627,105 @@ privileged users in the initial user namespace. However, it is perfectly possible to combine idmapped mounts with filesystems mountable inside user namespaces. We will touch on this further below. +Filesystem types vs idmapped mount types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With the introduction of idmapped mounts we need to distinguish between +filesystem ownership and mount ownership of a VFS object such as an inode. The +owner of a inode might be different when looked at from a filesystem +perspective than when looked at from an idmapped mount. Such fundamental +conceptual distinctions should almost always be clearly expressed in the code. +So, to distinguish idmapped mount ownership from filesystem ownership separate +types have been introduced. + +If a uid or gid has been generated using the filesystem or caller's idmapping +then we will use the ``kuid_t`` and ``kgid_t`` types. However, if a uid or gid +has been generated using a mount idmapping then we will be using the dedicated +``vfsuid_t`` and ``vfsgid_t`` types. + +All VFS helpers that generate or take uids and gids as arguments use the +``vfsuid_t`` and ``vfsgid_t`` types and we will be able to rely on the compiler +to catch errors that originate from conflating filesystem and VFS uids and gids. + +The ``vfsuid_t`` and ``vfsgid_t`` types are often mapped from and to ``kuid_t`` +and ``kgid_t`` types similar how ``kuid_t`` and ``kgid_t`` types are mapped +from and to ``uid_t`` and ``gid_t`` types:: + + uid_t <--> kuid_t <--> vfsuid_t + gid_t <--> kgid_t <--> vfsgid_t + +Whenever we report ownership based on a ``vfsuid_t`` or ``vfsgid_t`` type, +e.g., during ``stat()``, or store ownership information in a shared VFS object +based on a ``vfsuid_t`` or ``vfsgid_t`` type, e.g., during ``chown()`` we can +use the ``vfsuid_into_kuid()`` and ``vfsgid_into_kgid()`` helpers. + +To illustrate why this helper currently exists, consider what happens when we +change ownership of an inode from an idmapped mount. After we generated +a ``vfsuid_t`` or ``vfsgid_t`` based on the mount idmapping we later commit to +this ``vfsuid_t`` or ``vfsgid_t`` to become the new filesytem wide ownership. +Thus, we are turning the ``vfsuid_t`` or ``vfsgid_t`` into a global ``kuid_t`` +or ``kgid_t``. And this can be done by using ``vfsuid_into_kuid()`` and +``vfsgid_into_kgid()``. + +Note, whenever a shared VFS object, e.g., a cached ``struct inode`` or a cached +``struct posix_acl``, stores ownership information a filesystem or "global" +``kuid_t`` and ``kgid_t`` must be used. Ownership expressed via ``vfsuid_t`` +and ``vfsgid_t`` is specific to an idmapped mount. + +We already noted that ``vfsuid_t`` and ``vfsgid_t`` types are generated based +on mount idmappings whereas ``kuid_t`` and ``kgid_t`` types are generated based +on filesystem idmappings. To prevent abusing filesystem idmappings to generate +``vfsuid_t`` or ``vfsgid_t`` types or mount idmappings to generate ``kuid_t`` +or ``kgid_t`` types filesystem idmappings and mount idmappings are different +types as well. + +All helpers that map to or from ``vfsuid_t`` and ``vfsgid_t`` types require +a mount idmapping to be passed which is of type ``struct mnt_idmap``. Passing +a filesystem or caller idmapping will cause a compilation error. + +Similar to how we prefix all userspace ids in this document with ``u`` and all +kernel ids with ``k`` we will prefix all VFS ids with ``v``. So a mount +idmapping will be written as: ``u0:v10000:r10000``. + Remapping helpers ~~~~~~~~~~~~~~~~~ Idmapping functions were added that translate between idmappings. They make use -of the remapping algorithm we've introduced earlier. We're going to look at -two: +of the remapping algorithm we've introduced earlier. We're going to look at: -- ``i_uid_into_mnt()`` and ``i_gid_into_mnt()`` +- ``i_uid_into_vfsuid()`` and ``i_gid_into_vfsgid()`` - The ``i_*id_into_mnt()`` functions translate filesystem's kernel ids into - kernel ids in the mount's idmapping:: + The ``i_*id_into_vfs*id()`` functions translate filesystem's kernel ids into + VFS ids in the mount's idmapping:: /* Map the filesystem's kernel id up into a userspace id in the filesystem's idmapping. */ from_kuid(filesystem, kid) = uid - /* Map the filesystem's userspace id down ito a kernel id in the mount's idmapping. */ + /* Map the filesystem's userspace id down ito a VFS id in the mount's idmapping. */ make_kuid(mount, uid) = kuid - ``mapped_fsuid()`` and ``mapped_fsgid()`` The ``mapped_fs*id()`` functions translate the caller's kernel ids into kernel ids in the filesystem's idmapping. This translation is achieved by - remapping the caller's kernel ids using the mount's idmapping:: + remapping the caller's VFS ids using the mount's idmapping:: - /* Map the caller's kernel id up into a userspace id in the mount's idmapping. */ + /* Map the caller's VFS id up into a userspace id in the mount's idmapping. */ from_kuid(mount, kid) = uid /* Map the mount's userspace id down into a kernel id in the filesystem's idmapping. */ make_kuid(filesystem, uid) = kuid +- ``vfsuid_into_kuid()`` and ``vfsgid_into_kgid()`` + + Whenever + Note that these two functions invert each other. Consider the following idmappings:: caller idmapping: u0:k10000:r10000 filesystem idmapping: u0:k20000:r10000 - mount idmapping: u0:k10000:r10000 + mount idmapping: u0:v10000:r10000 Assume a file owned by ``u1000`` is read from disk. The filesystem maps this id to ``k21000`` according to its idmapping. This is what is stored in the @@ -669,20 +736,21 @@ would usually simply use the crossmapping algorithm and map the filesystem's kernel id up to a userspace id in the caller's idmapping. But when the caller is accessing the file on an idmapped mount the kernel will -first call ``i_uid_into_mnt()`` thereby translating the filesystem's kernel id -into a kernel id in the mount's idmapping:: +first call ``i_uid_into_vfsuid()`` thereby translating the filesystem's kernel +id into a VFS id in the mount's idmapping:: - i_uid_into_mnt(k21000): + i_uid_into_vfsuid(k21000): /* Map the filesystem's kernel id up into a userspace id. */ from_kuid(u0:k20000:r10000, k21000) = u1000 - /* Map the filesystem's userspace id down ito a kernel id in the mount's idmapping. */ - make_kuid(u0:k10000:r10000, u1000) = k11000 + /* Map the filesystem's userspace id down into a VFS id in the mount's idmapping. */ + make_kuid(u0:v10000:r10000, u1000) = v11000 Finally, when the kernel reports the owner to the caller it will turn the -kernel id in the mount's idmapping into a userspace id in the caller's +VFS id in the mount's idmapping into a userspace id in the caller's idmapping:: + k11000 = vfsuid_into_kuid(v11000) from_kuid(u0:k10000:r10000, k11000) = u1000 We can test whether this algorithm really works by verifying what happens when @@ -696,18 +764,19 @@ fails. But when the caller is accessing the file on an idmapped mount the kernel will first call ``mapped_fs*id()`` thereby translating the caller's kernel id into -a kernel id according to the mount's idmapping:: +a VFS id according to the mount's idmapping:: mapped_fsuid(k11000): /* Map the caller's kernel id up into a userspace id in the mount's idmapping. */ from_kuid(u0:k10000:r10000, k11000) = u1000 /* Map the mount's userspace id down into a kernel id in the filesystem's idmapping. */ - make_kuid(u0:k20000:r10000, u1000) = k21000 + make_kuid(u0:v20000:r10000, u1000) = v21000 -When finally writing to disk the kernel will then map ``k21000`` up into a +When finally writing to disk the kernel will then map ``v21000`` up into a userspace id in the filesystem's idmapping:: + k21000 = vfsuid_into_kuid(v21000) from_kuid(u0:k20000:r10000, k21000) = u1000 As we can see, we end up with an invertible and therefore information @@ -725,7 +794,7 @@ Example 2 reconsidered caller id: u1000 caller idmapping: u0:k10000:r10000 filesystem idmapping: u0:k20000:r10000 - mount idmapping: u0:k10000:r10000 + mount idmapping: u0:v10000:r10000 When the caller is using a non-initial idmapping the common case is to attach the same idmapping to the mount. We now perform three steps: @@ -734,12 +803,12 @@ the same idmapping to the mount. We now perform three steps: make_kuid(u0:k10000:r10000, u1000) = k11000 -2. Translate the caller's kernel id into a kernel id in the filesystem's +2. Translate the caller's VFS id into a kernel id in the filesystem's idmapping:: - mapped_fsuid(k11000): - /* Map the kernel id up into a userspace id in the mount's idmapping. */ - from_kuid(u0:k10000:r10000, k11000) = u1000 + mapped_fsuid(v11000): + /* Map the VFS id up into a userspace id in the mount's idmapping. */ + from_kuid(u0:v10000:r10000, v11000) = u1000 /* Map the userspace id down into a kernel id in the filesystem's idmapping. */ make_kuid(u0:k20000:r10000, u1000) = k21000 @@ -759,7 +828,7 @@ Example 3 reconsidered caller id: u1000 caller idmapping: u0:k10000:r10000 filesystem idmapping: u0:k0:r4294967295 - mount idmapping: u0:k10000:r10000 + mount idmapping: u0:v10000:r10000 The same translation algorithm works with the third example. @@ -767,12 +836,12 @@ The same translation algorithm works with the third example. make_kuid(u0:k10000:r10000, u1000) = k11000 -2. Translate the caller's kernel id into a kernel id in the filesystem's +2. Translate the caller's VFS id into a kernel id in the filesystem's idmapping:: - mapped_fsuid(k11000): - /* Map the kernel id up into a userspace id in the mount's idmapping. */ - from_kuid(u0:k10000:r10000, k11000) = u1000 + mapped_fsuid(v11000): + /* Map the VFS id up into a userspace id in the mount's idmapping. */ + from_kuid(u0:v10000:r10000, v11000) = u1000 /* Map the userspace id down into a kernel id in the filesystem's idmapping. */ make_kuid(u0:k0:r4294967295, u1000) = k1000 @@ -792,7 +861,7 @@ Example 4 reconsidered file id: u1000 caller idmapping: u0:k10000:r10000 filesystem idmapping: u0:k0:r4294967295 - mount idmapping: u0:k10000:r10000 + mount idmapping: u0:v10000:r10000 In order to report ownership to userspace the kernel now does three steps using the translation algorithm we introduced earlier: @@ -802,17 +871,18 @@ the translation algorithm we introduced earlier: make_kuid(u0:k0:r4294967295, u1000) = k1000 -2. Translate the kernel id into a kernel id in the mount's idmapping:: +2. Translate the kernel id into a VFS id in the mount's idmapping:: - i_uid_into_mnt(k1000): + i_uid_into_vfsuid(k1000): /* Map the kernel id up into a userspace id in the filesystem's idmapping. */ from_kuid(u0:k0:r4294967295, k1000) = u1000 - /* Map the userspace id down into a kernel id in the mounts's idmapping. */ - make_kuid(u0:k10000:r10000, u1000) = k11000 + /* Map the userspace id down into a VFS id in the mounts's idmapping. */ + make_kuid(u0:v10000:r10000, u1000) = v11000 -3. Map the kernel id up into a userspace id in the caller's idmapping:: +3. Map the VFS id up into a userspace id in the caller's idmapping:: + k11000 = vfsuid_into_kuid(v11000) from_kuid(u0:k10000:r10000, k11000) = u1000 Earlier, the caller's kernel id couldn't be crossmapped in the filesystems's @@ -828,7 +898,7 @@ Example 5 reconsidered file id: u1000 caller idmapping: u0:k10000:r10000 filesystem idmapping: u0:k20000:r10000 - mount idmapping: u0:k10000:r10000 + mount idmapping: u0:v10000:r10000 Again, in order to report ownership to userspace the kernel now does three steps using the translation algorithm we introduced earlier: @@ -838,17 +908,18 @@ steps using the translation algorithm we introduced earlier: make_kuid(u0:k20000:r10000, u1000) = k21000 -2. Translate the kernel id into a kernel id in the mount's idmapping:: +2. Translate the kernel id into a VFS id in the mount's idmapping:: - i_uid_into_mnt(k21000): + i_uid_into_vfsuid(k21000): /* Map the kernel id up into a userspace id in the filesystem's idmapping. */ from_kuid(u0:k20000:r10000, k21000) = u1000 - /* Map the userspace id down into a kernel id in the mounts's idmapping. */ - make_kuid(u0:k10000:r10000, u1000) = k11000 + /* Map the userspace id down into a VFS id in the mounts's idmapping. */ + make_kuid(u0:v10000:r10000, u1000) = v11000 -3. Map the kernel id up into a userspace id in the caller's idmapping:: +3. Map the VFS id up into a userspace id in the caller's idmapping:: + k11000 = vfsuid_into_kuid(v11000) from_kuid(u0:k10000:r10000, k11000) = u1000 Earlier, the file's kernel id couldn't be crossmapped in the filesystems's @@ -899,23 +970,23 @@ from above::: caller id: u1125 caller idmapping: u0:k0:r4294967295 filesystem idmapping: u0:k0:r4294967295 - mount idmapping: u1000:k1125:r1 + mount idmapping: u1000:v1125:r1 1. Map the caller's userspace ids into kernel ids in the caller's idmapping:: make_kuid(u0:k0:r4294967295, u1125) = k1125 -2. Translate the caller's kernel id into a kernel id in the filesystem's +2. Translate the caller's VFS id into a kernel id in the filesystem's idmapping:: - mapped_fsuid(k1125): - /* Map the kernel id up into a userspace id in the mount's idmapping. */ - from_kuid(u1000:k1125:r1, k1125) = u1000 + mapped_fsuid(v1125): + /* Map the VFS id up into a userspace id in the mount's idmapping. */ + from_kuid(u1000:v1125:r1, v1125) = u1000 /* Map the userspace id down into a kernel id in the filesystem's idmapping. */ make_kuid(u0:k0:r4294967295, u1000) = k1000 -2. Verify that the caller's kernel ids can be mapped to userspace ids in the +2. Verify that the caller's filesystem ids can be mapped to userspace ids in the filesystem's idmapping:: from_kuid(u0:k0:r4294967295, k1000) = u1000 @@ -930,24 +1001,25 @@ on their work computer: file id: u1000 caller idmapping: u0:k0:r4294967295 filesystem idmapping: u0:k0:r4294967295 - mount idmapping: u1000:k1125:r1 + mount idmapping: u1000:v1125:r1 1. Map the userspace id on disk down into a kernel id in the filesystem's idmapping:: make_kuid(u0:k0:r4294967295, u1000) = k1000 -2. Translate the kernel id into a kernel id in the mount's idmapping:: +2. Translate the kernel id into a VFS id in the mount's idmapping:: - i_uid_into_mnt(k1000): + i_uid_into_vfsuid(k1000): /* Map the kernel id up into a userspace id in the filesystem's idmapping. */ from_kuid(u0:k0:r4294967295, k1000) = u1000 - /* Map the userspace id down into a kernel id in the mounts's idmapping. */ - make_kuid(u1000:k1125:r1, u1000) = k1125 + /* Map the userspace id down into a VFS id in the mounts's idmapping. */ + make_kuid(u1000:v1125:r1, u1000) = v1125 -3. Map the kernel id up into a userspace id in the caller's idmapping:: +3. Map the VFS id up into a userspace id in the caller's idmapping:: + k1125 = vfsuid_into_kuid(v1125) from_kuid(u0:k0:r4294967295, k1125) = u1125 So ultimately the caller will be reported that the file belongs to ``u1125`` diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index 63204d2094fd..9aaf6ef75eb5 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -79,7 +79,6 @@ context. This is represented by the fs_context structure:: unsigned int sb_flags; unsigned int sb_flags_mask; unsigned int s_iflags; - unsigned int lsm_flags; enum fs_context_purpose purpose:8; ... }; diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 9d5fd9424e8b..59db0bed35e1 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -85,7 +85,7 @@ contact Bodo Bauer at bb@ricochet.net. We'll be happy to add them to this document. The latest version of this document is available online at -http://tldp.org/LDP/Linux-Filesystem-Hierarchy/html/proc.html +https://www.kernel.org/doc/html/latest/filesystems/proc.html If the above direction does not works for you, you could try the kernel mailing list at linux-kernel@vger.kernel.org and/or try to reach me at @@ -232,7 +232,7 @@ asynchronous manner and the value may not be very precise. To see a precise snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table. It's slow but very precise. -.. table:: Table 1-2: Contents of the status files (as of 4.19) +.. table:: Table 1-2: Contents of the status fields (as of 4.19) ========================== =================================================== Field Content @@ -305,7 +305,7 @@ It's slow but very precise. ========================== =================================================== -.. table:: Table 1-3: Contents of the statm files (as of 2.6.8-rc3) +.. table:: Table 1-3: Contents of the statm fields (as of 2.6.8-rc3) ======== =============================== ============================== Field Content @@ -323,7 +323,7 @@ It's slow but very precise. ======== =============================== ============================== -.. table:: Table 1-4: Contents of the stat files (as of 2.6.30-rc7) +.. table:: Table 1-4: Contents of the stat fields (as of 2.6.30-rc7) ============= =============================================================== Field Content @@ -1321,9 +1321,9 @@ many times the slaves link has failed. 1.4 SCSI info ------------- -If you have a SCSI host adapter in your system, you'll find a subdirectory -named after the driver for this adapter in /proc/scsi. You'll also see a list -of all recognized SCSI devices in /proc/scsi:: +If you have a SCSI or ATA host adapter in your system, you'll find a +subdirectory named after the driver for this adapter in /proc/scsi. +You'll also see a list of all recognized SCSI devices in /proc/scsi:: >cat /proc/scsi/scsi Attached devices: @@ -1449,16 +1449,18 @@ Various pieces of information about kernel activity are available in the since the system first booted. For a quick look, simply cat the file:: > cat /proc/stat - cpu 2255 34 2290 22625563 6290 127 456 0 0 0 - cpu0 1132 34 1441 11311718 3675 127 438 0 0 0 - cpu1 1123 0 849 11313845 2614 0 18 0 0 0 - intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] - ctxt 1990473 - btime 1062191376 - processes 2915 - procs_running 1 + cpu 237902850 368826709 106375398 1873517540 1135548 0 14507935 0 0 0 + cpu0 60045249 91891769 26331539 468411416 495718 0 5739640 0 0 0 + cpu1 59746288 91759249 26609887 468860630 312281 0 4384817 0 0 0 + cpu2 59489247 92985423 26904446 467808813 171668 0 2268998 0 0 0 + cpu3 58622065 92190267 26529524 468436680 155879 0 2114478 0 0 0 + intr 8688370575 8 3373 0 0 0 0 0 0 1 40791 0 0 353317 0 0 0 0 224789828 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 190974333 41958554 123983334 43 0 224593 0 0 0 <more 0's deleted> + ctxt 22848221062 + btime 1605316999 + processes 746787147 + procs_running 2 procs_blocked 0 - softirq 183433 0 21755 12 39 1137 231 21459 2263 + softirq 12121874454 100099120 3938138295 127375644 2795979 187870761 0 173808342 3072582055 52608 224184354 The very first "cpu" line aggregates the numbers in all of the other "cpuN" lines. These numbers identify the amount of time the CPU has spent performing @@ -1520,8 +1522,8 @@ softirq. Information about mounted ext4 file systems can be found in /proc/fs/ext4. Each mounted filesystem will have a directory in /proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or -/proc/fs/ext4/dm-0). The files in each per-device directory are shown -in Table 1-12, below. +/proc/fs/ext4/sda9 or /proc/fs/ext4/dm-0). The files in each per-device +directory are shown in Table 1-12, below. .. table:: Table 1-12: Files in /proc/fs/ext4/<devname> @@ -1601,12 +1603,12 @@ can inadvertently disrupt your system, it is advisable to read both documentation and source before actually making adjustments. In any case, be very careful when writing to any of these files. The entries in /proc may change slightly between the 2.1.* and the 2.2 kernel, so if there is any doubt -review the kernel documentation in the directory /usr/src/linux/Documentation. +review the kernel documentation in the directory linux/Documentation. This chapter is heavily based on the documentation included in the pre 2.2 kernels, and became part of it in version 2.2.1 of the Linux kernel. -Please see: Documentation/admin-guide/sysctl/ directory for descriptions of these -entries. +Please see: Documentation/admin-guide/sysctl/ directory for descriptions of +these entries. Summary ------- diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index f3b344f0c0a4..769be5230210 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -107,7 +107,7 @@ file /proc/filesystems. struct file_system_type ----------------------- -This describes the filesystem. As of kernel 2.6.39, the following +This describes the filesystem. The following members are defined: .. code-block:: c @@ -115,14 +115,24 @@ members are defined: struct file_system_type { const char *name; int fs_flags; + int (*init_fs_context)(struct fs_context *); + const struct fs_parameter_spec *parameters; struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); + const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; - struct list_head fs_supers; + struct hlist_head fs_supers; + struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; + struct lock_class_key s_vfs_rename_key; + struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; + + struct lock_class_key i_lock_key; + struct lock_class_key i_mutex_key; + struct lock_class_key invalidate_lock_key; + struct lock_class_key i_mutex_dir_key; }; ``name`` @@ -132,6 +142,15 @@ members are defined: ``fs_flags`` various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) +``init_fs_context`` + Initializes 'struct fs_context' ->ops and ->fs_private fields with + filesystem-specific data. + +``parameters`` + Pointer to the array of filesystem parameters descriptors + 'struct fs_parameter_spec'. + More info in Documentation/filesystems/mount_api.rst. + ``mount`` the method to call when a new instance of this filesystem should be mounted @@ -148,7 +167,11 @@ members are defined: ``next`` for internal VFS use: you should initialize this to NULL - s_lock_key, s_umount_key: lockdep-specific +``fs_supers`` + for internal VFS use: hlist of filesystem instances (superblocks) + + s_lock_key, s_umount_key, s_vfs_rename_key, s_writers_key, + i_lock_key, i_mutex_key, invalidate_lock_key, i_mutex_dir_key: lockdep-specific The mount() method has the following arguments: @@ -222,33 +245,42 @@ struct super_operations ----------------------- This describes how the VFS can manipulate the superblock of your -filesystem. As of kernel 2.6.22, the following members are defined: +filesystem. The following members are defined: .. code-block:: c struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); + void (*free_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, int); - void (*drop_inode) (struct inode *); - void (*delete_inode) (struct inode *); + int (*write_inode) (struct inode *, struct writeback_control *wbc); + int (*drop_inode) (struct inode *); + void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_super) (struct super_block *); int (*freeze_fs) (struct super_block *); + int (*thaw_super) (struct super_block *); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); - void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); + int (*show_devname)(struct seq_file *, struct dentry *); + int (*show_path)(struct seq_file *, struct dentry *); + int (*show_stats)(struct seq_file *, struct dentry *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - int (*nr_cached_objects)(struct super_block *); - void (*free_cached_objects)(struct super_block *, int); + struct dquot **(*get_dquots)(struct inode *); + + long (*nr_cached_objects)(struct super_block *, + struct shrink_control *); + long (*free_cached_objects)(struct super_block *, + struct shrink_control *); }; All methods are called without any locks being held, unless otherwise @@ -269,6 +301,11 @@ or bottom half). ->alloc_inode was defined and simply undoes anything done by ->alloc_inode. +``free_inode`` + this method is called from RCU callback. If you use call_rcu() + in ->destroy_inode to free 'struct inode' memory, then it's + better to release memory in this method. + ``dirty_inode`` this method is called by the VFS when an inode is marked dirty. This is specifically for the inode itself being marked dirty, @@ -296,8 +333,12 @@ or bottom half). practice of using "force_delete" in the put_inode() case, but does not have the races that the "force_delete()" approach had. -``delete_inode`` - called when the VFS wants to delete an inode +``evict_inode`` + called when the VFS wants to evict an inode. Caller does + *not* evict the pagecache or inode-associated metadata buffers; + the method has to use truncate_inode_pages_final() to get rid + of those. Caller makes sure async writeback cannot be running for + the inode while (or after) ->evict_inode() is called. Optional. ``put_super`` called when the VFS wishes to free the superblock @@ -308,14 +349,25 @@ or bottom half). superblock. The second parameter indicates whether the method should wait until the write out has been completed. Optional. +``freeze_super`` + Called instead of ->freeze_fs callback if provided. + Main difference is that ->freeze_super is called without taking + down_write(&sb->s_umount). If filesystem implements it and wants + ->freeze_fs to be called too, then it has to call ->freeze_fs + explicitly from this callback. Optional. + ``freeze_fs`` called when VFS is locking a filesystem and forcing it into a consistent state. This method is currently used by the Logical - Volume Manager (LVM). + Volume Manager (LVM) and ioctl(FIFREEZE). Optional. + +``thaw_super`` + called when VFS is unlocking a filesystem and making it writable + again after ->freeze_super. Optional. ``unfreeze_fs`` called when VFS is unlocking a filesystem and making it writable - again. + again after ->freeze_fs. Optional. ``statfs`` called when the VFS needs to get filesystem statistics. @@ -324,22 +376,37 @@ or bottom half). called when the filesystem is remounted. This is called with the kernel lock held -``clear_inode`` - called then the VFS clears the inode. Optional - ``umount_begin`` called when the VFS is unmounting a filesystem. ``show_options`` - called by the VFS to show mount options for /proc/<pid>/mounts. + called by the VFS to show mount options for /proc/<pid>/mounts + and /proc/<pid>/mountinfo. (see "Mount Options" section) +``show_devname`` + Optional. Called by the VFS to show device name for + /proc/<pid>/{mounts,mountinfo,mountstats}. If not provided then + '(struct mount).mnt_devname' will be used. + +``show_path`` + Optional. Called by the VFS (for /proc/<pid>/mountinfo) to show + the mount root dentry path relative to the filesystem root. + +``show_stats`` + Optional. Called by the VFS (for /proc/<pid>/mountstats) to show + filesystem-specific mount statistics. + ``quota_read`` called by the VFS to read from filesystem quota file. ``quota_write`` called by the VFS to write to filesystem quota file. +``get_dquots`` + called by quota to get 'struct dquot' array for a particular inode. + Optional. + ``nr_cached_objects`` called by the sb cache shrinking function for the filesystem to return the number of freeable cached objects it contains. diff --git a/Documentation/index.rst b/Documentation/index.rst index 76d1a3ec9be3..9dfdc826618c 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -99,7 +99,7 @@ Architecture-specific documentation .. toctree:: :maxdepth: 2 - arch + arch/index Other documentation diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst index bfb51685073c..c3851fe1900d 100644 --- a/Documentation/kbuild/llvm.rst +++ b/Documentation/kbuild/llvm.rst @@ -171,6 +171,10 @@ Getting Help Getting LLVM ------------- +We provide prebuilt stable versions of LLVM on `kernel.org <https://kernel.org/pub/tools/llvm/>`_. +Below are links that may be useful for building LLVM from source or procuring +it through a distribution's package manager. + - https://releases.llvm.org/download.html - https://github.com/llvm/llvm-project - https://llvm.org/docs/GettingStarted.html diff --git a/Documentation/kernel-hacking/false-sharing.rst b/Documentation/kernel-hacking/false-sharing.rst new file mode 100644 index 000000000000..122b0e124656 --- /dev/null +++ b/Documentation/kernel-hacking/false-sharing.rst @@ -0,0 +1,206 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +False Sharing +============= + +What is False Sharing +===================== +False sharing is related with cache mechanism of maintaining the data +coherence of one cache line stored in multiple CPU's caches; then +academic definition for it is in [1]_. Consider a struct with a +refcount and a string:: + + struct foo { + refcount_t refcount; + ... + char name[16]; + } ____cacheline_internodealigned_in_smp; + +Member 'refcount'(A) and 'name'(B) _share_ one cache line like below:: + + +-----------+ +-----------+ + | CPU 0 | | CPU 1 | + +-----------+ +-----------+ + / | + / | + V V + +----------------------+ +----------------------+ + | A B | Cache 0 | A B | Cache 1 + +----------------------+ +----------------------+ + | | + ---------------------------+------------------+----------------------------- + | | + +----------------------+ + | | + +----------------------+ + Main Memory | A B | + +----------------------+ + +'refcount' is modified frequently, but 'name' is set once at object +creation time and is never modified. When many CPUs access 'foo' at +the same time, with 'refcount' being only bumped by one CPU frequently +and 'name' being read by other CPUs, all those reading CPUs have to +reload the whole cache line over and over due to the 'sharing', even +though 'name' is never changed. + +There are many real-world cases of performance regressions caused by +false sharing. One of these is a rw_semaphore 'mmap_lock' inside +mm_struct struct, whose cache line layout change triggered a +regression and Linus analyzed in [2]_. + +There are two key factors for a harmful false sharing: + +* A global datum accessed (shared) by many CPUs +* In the concurrent accesses to the data, there is at least one write + operation: write/write or write/read cases. + +The sharing could be from totally unrelated kernel components, or +different code paths of the same kernel component. + + +False Sharing Pitfalls +====================== +Back in time when one platform had only one or a few CPUs, hot data +members could be purposely put in the same cache line to make them +cache hot and save cacheline/TLB, like a lock and the data protected +by it. But for recent large system with hundreds of CPUs, this may +not work when the lock is heavily contended, as the lock owner CPU +could write to the data, while other CPUs are busy spinning the lock. + +Looking at past cases, there are several frequently occurring patterns +for false sharing: + +* lock (spinlock/mutex/semaphore) and data protected by it are + purposely put in one cache line. +* global data being put together in one cache line. Some kernel + subsystems have many global parameters of small size (4 bytes), + which can easily be grouped together and put into one cache line. +* data members of a big data structure randomly sitting together + without being noticed (cache line is usually 64 bytes or more), + like 'mem_cgroup' struct. + +Following 'mitigation' section provides real-world examples. + +False sharing could easily happen unless they are intentionally +checked, and it is valuable to run specific tools for performance +critical workloads to detect false sharing affecting performance case +and optimize accordingly. + + +How to detect and analyze False Sharing +======================================== +perf record/report/stat are widely used for performance tuning, and +once hotspots are detected, tools like 'perf-c2c' and 'pahole' can +be further used to detect and pinpoint the possible false sharing +data structures. 'addr2line' is also good at decoding instruction +pointer when there are multiple layers of inline functions. + +perf-c2c can capture the cache lines with most false sharing hits, +decoded functions (line number of file) accessing that cache line, +and in-line offset of the data. Simple commands are:: + + $ perf c2c record -ag sleep 3 + $ perf c2c report --call-graph none -k vmlinux + +When running above during testing will-it-scale's tlb_flush1 case, +perf reports something like:: + + Total records : 1658231 + Locked Load/Store Operations : 89439 + Load Operations : 623219 + Load Local HITM : 92117 + Load Remote HITM : 139 + + #---------------------------------------------------------------------- + 4 0 2374 0 0 0 0xff1100088366d880 + #---------------------------------------------------------------------- + 0.00% 42.29% 0.00% 0.00% 0.00% 0x8 1 1 0xffffffff81373b7b 0 231 129 5312 64 [k] __mod_lruvec_page_state [kernel.vmlinux] memcontrol.h:752 1 + 0.00% 13.10% 0.00% 0.00% 0.00% 0x8 1 1 0xffffffff81374718 0 226 97 3551 64 [k] folio_lruvec_lock_irqsave [kernel.vmlinux] memcontrol.h:752 1 + 0.00% 11.20% 0.00% 0.00% 0.00% 0x8 1 1 0xffffffff812c29bf 0 170 136 555 64 [k] lru_add_fn [kernel.vmlinux] mm_inline.h:41 1 + 0.00% 7.62% 0.00% 0.00% 0.00% 0x8 1 1 0xffffffff812c3ec5 0 175 108 632 64 [k] release_pages [kernel.vmlinux] mm_inline.h:41 1 + 0.00% 23.29% 0.00% 0.00% 0.00% 0x10 1 1 0xffffffff81372d0a 0 234 279 1051 64 [k] __mod_memcg_lruvec_state [kernel.vmlinux] memcontrol.c:736 1 + +A nice introduction for perf-c2c is [3]_. + +'pahole' decodes data structure layouts delimited in cache line +granularity. Users can match the offset in perf-c2c output with +pahole's decoding to locate the exact data members. For global +data, users can search the data address in System.map. + + +Possible Mitigations +==================== +False sharing does not always need to be mitigated. False sharing +mitigations should balance performance gains with complexity and +space consumption. Sometimes, lower performance is OK, and it's +unnecessary to hyper-optimize every rarely used data structure or +a cold data path. + +False sharing hurting performance cases are seen more frequently with +core count increasing. Because of these detrimental effects, many +patches have been proposed across variety of subsystems (like +networking and memory management) and merged. Some common mitigations +(with examples) are: + +* Separate hot global data in its own dedicated cache line, even if it + is just a 'short' type. The downside is more consumption of memory, + cache line and TLB entries. + + - Commit 91b6d3256356 ("net: cache align tcp_memory_allocated, tcp_sockets_allocated") + +* Reorganize the data structure, separate the interfering members to + different cache lines. One downside is it may introduce new false + sharing of other members. + + - Commit 802f1d522d5f ("mm: page_counter: re-layout structure to reduce false sharing") + +* Replace 'write' with 'read' when possible, especially in loops. + Like for some global variable, use compare(read)-then-write instead + of unconditional write. For example, use:: + + if (!test_bit(XXX)) + set_bit(XXX); + + instead of directly "set_bit(XXX);", similarly for atomic_t data:: + + if (atomic_read(XXX) == AAA) + atomic_set(XXX, BBB); + + - Commit 7b1002f7cfe5 ("bcache: fixup bcache_dev_sectors_dirty_add() multithreaded CPU false sharing") + - Commit 292648ac5cf1 ("mm: gup: allow FOLL_PIN to scale in SMP") + +* Turn hot global data to 'per-cpu data + global data' when possible, + or reasonably increase the threshold for syncing per-cpu data to + global data, to reduce or postpone the 'write' to that global data. + + - Commit 520f897a3554 ("ext4: use percpu_counters for extent_status cache hits/misses") + - Commit 56f3547bfa4d ("mm: adjust vm_committed_as_batch according to vm overcommit policy") + +Surely, all mitigations should be carefully verified to not cause side +effects. To avoid introducing false sharing when coding, it's better +to: + +* Be aware of cache line boundaries +* Group mostly read-only fields together +* Group things that are written at the same time together +* Separate frequently read and frequently written fields on + different cache lines. + +and better add a comment stating the false sharing consideration. + +One note is, sometimes even after a severe false sharing is detected +and solved, the performance may still have no obvious improvement as +the hotspot switches to a new place. + + +Miscellaneous +============= +One open issue is that kernel has an optional data structure +randomization mechanism, which also randomizes the situation of cache +line sharing of data members. + + +.. [1] https://en.wikipedia.org/wiki/False_sharing +.. [2] https://lore.kernel.org/lkml/CAHk-=whoqV=cX5VC80mmR9rr+Z+yQ6fiQZm36Fb-izsanHg23w@mail.gmail.com/ +.. [3] https://joemario.github.io/blog/2016/09/01/c2c-blog/ diff --git a/Documentation/kernel-hacking/index.rst b/Documentation/kernel-hacking/index.rst index f53027652290..79c03bac99a2 100644 --- a/Documentation/kernel-hacking/index.rst +++ b/Documentation/kernel-hacking/index.rst @@ -9,3 +9,4 @@ Kernel Hacking Guides hacking locking + false-sharing diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README index 7f5c6c3ed6c3..658d37860d39 100644 --- a/Documentation/litmus-tests/README +++ b/Documentation/litmus-tests/README @@ -9,7 +9,7 @@ a kernel test module based on a litmus test, please see tools/memory-model/README. -atomic (/atomic derectory) +atomic (/atomic directory) -------------------------- Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus diff --git a/Documentation/litmus-tests/locking/DCL-broken.litmus b/Documentation/litmus-tests/locking/DCL-broken.litmus new file mode 100644 index 000000000000..bfb7ba4316d6 --- /dev/null +++ b/Documentation/litmus-tests/locking/DCL-broken.litmus @@ -0,0 +1,54 @@ +C DCL-broken + +(* + * Result: Sometimes + * + * This litmus test demonstrates more than just locking is required to + * correctly implement double-checked locking. + *) + +{ + int flag; + int data; +} + +P0(int *flag, int *data, spinlock_t *lck) +{ + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + WRITE_ONCE(*flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +P1(int *flag, int *data, spinlock_t *lck) +{ + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + WRITE_ONCE(*flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +locations [flag;data;0:r0;0:r1;1:r0;1:r1] +exists (0:r2=0 \/ 1:r2=0) diff --git a/Documentation/litmus-tests/locking/DCL-fixed.litmus b/Documentation/litmus-tests/locking/DCL-fixed.litmus new file mode 100644 index 000000000000..d1b60bcb0c8f --- /dev/null +++ b/Documentation/litmus-tests/locking/DCL-fixed.litmus @@ -0,0 +1,55 @@ +C DCL-fixed + +(* + * Result: Never + * + * This litmus test demonstrates that double-checked locking can be + * reliable given proper use of smp_load_acquire() and smp_store_release() + * in addition to the locking. + *) + +{ + int flag; + int data; +} + +P0(int *flag, int *data, spinlock_t *lck) +{ + int r0; + int r1; + int r2; + + r0 = smp_load_acquire(flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + smp_store_release(flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +P1(int *flag, int *data, spinlock_t *lck) +{ + int r0; + int r1; + int r2; + + r0 = smp_load_acquire(flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + smp_store_release(flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +locations [flag;data;0:r0;0:r1;1:r0;1:r1] +exists (0:r2=0 \/ 1:r2=0) diff --git a/Documentation/litmus-tests/locking/RM-broken.litmus b/Documentation/litmus-tests/locking/RM-broken.litmus new file mode 100644 index 000000000000..b7ef30cedfe5 --- /dev/null +++ b/Documentation/litmus-tests/locking/RM-broken.litmus @@ -0,0 +1,41 @@ +C RM-broken + +(* + * Result: DEADLOCK + * + * This litmus test demonstrates that the old "roach motel" approach + * to locking, where code can be freely moved into critical sections, + * cannot be used in the Linux kernel. + *) + +{ + int x; + atomic_t y; +} + +P0(int *x, atomic_t *y, spinlock_t *lck) +{ + int r2; + + spin_lock(lck); + r2 = atomic_inc_return(y); + WRITE_ONCE(*x, 1); + spin_unlock(lck); +} + +P1(int *x, atomic_t *y, spinlock_t *lck) +{ + int r0; + int r1; + int r2; + + spin_lock(lck); + r0 = READ_ONCE(*x); + r1 = READ_ONCE(*x); + r2 = atomic_inc_return(y); + spin_unlock(lck); +} + +locations [x;0:r2;1:r0;1:r1;1:r2] +filter (1:r0=0 /\ 1:r1=1) +exists (1:r2=1) diff --git a/Documentation/litmus-tests/locking/RM-fixed.litmus b/Documentation/litmus-tests/locking/RM-fixed.litmus new file mode 100644 index 000000000000..b62817559616 --- /dev/null +++ b/Documentation/litmus-tests/locking/RM-fixed.litmus @@ -0,0 +1,41 @@ +C RM-fixed + +(* + * Result: Never + * + * This litmus test demonstrates that the old "roach motel" approach + * to locking, where code can be freely moved into critical sections, + * cannot be used in the Linux kernel. + *) + +{ + int x; + atomic_t y; +} + +P0(int *x, atomic_t *y, spinlock_t *lck) +{ + int r2; + + spin_lock(lck); + r2 = atomic_inc_return(y); + WRITE_ONCE(*x, 1); + spin_unlock(lck); +} + +P1(int *x, atomic_t *y, spinlock_t *lck) +{ + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*x); + r1 = READ_ONCE(*x); + spin_lock(lck); + r2 = atomic_inc_return(y); + spin_unlock(lck); +} + +locations [x;0:r2;1:r0;1:r1;1:r2] +filter (1:r0=0 /\ 1:r1=1) +exists (1:r2=1) diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst index 1bc888d36ea1..531e73b003dd 100644 --- a/Documentation/mm/physical_memory.rst +++ b/Documentation/mm/physical_memory.rst @@ -19,7 +19,7 @@ a bank of memory very suitable for DMA near peripheral devices. Each bank is called a node and the concept is represented under Linux by a ``struct pglist_data`` even if the architecture is UMA. This structure is -always referenced to by it's typedef ``pg_data_t``. ``A pg_data_t`` structure +always referenced by its typedef ``pg_data_t``. A ``pg_data_t`` structure for a particular node can be referenced by ``NODE_DATA(nid)`` macro where ``nid`` is the ID of that node. @@ -114,6 +114,25 @@ RAM equally split between two nodes, there will be ``ZONE_DMA32``, | DMA32 | NORMAL | MOVABLE | | NORMAL | MOVABLE | +---------+----------+-----------+ +------------+-------------+ + +Memory banks may belong to interleaving nodes. In the example below an x86 +machine has 16 Gbytes of RAM in 4 memory banks, even banks belong to node 0 +and odd banks belong to node 1:: + + + 0 4G 8G 12G 16G + +-------------+ +-------------+ +-------------+ +-------------+ + | node 0 | | node 1 | | node 0 | | node 1 | + +-------------+ +-------------+ +-------------+ +-------------+ + + 0 16M 4G + +-----+-------+ +-------------+ +-------------+ +-------------+ + | DMA | DMA32 | | NORMAL | | NORMAL | | NORMAL | + +-----+-------+ +-------------+ +-------------+ +-------------+ + +In this case node 0 will span from 0 to 12 Gbytes and node 1 will span from +4 to 16 Gbytes. + .. _nodes: Nodes diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst index 10f282c2117c..2f60e34ab926 100644 --- a/Documentation/networking/devlink/ice.rst +++ b/Documentation/networking/devlink/ice.rst @@ -7,6 +7,21 @@ ice devlink support This document describes the devlink features implemented by the ``ice`` device driver. +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + - Notes + * - ``enable_roce`` + - runtime + - mutually exclusive with ``enable_iwarp`` + * - ``enable_iwarp`` + - runtime + - mutually exclusive with ``enable_roce`` + Info versions ============= diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 87dd1c5283e6..58a78a316697 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -340,6 +340,8 @@ tcp_app_win - INTEGER Reserve max(window/2^tcp_app_win, mss) of window for application buffer. Value 0 is special, it means that nothing is reserved. + Possible values are [0, 31], inclusive. + Default: 31 tcp_autocorking - BOOLEAN diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 007e49ef6cec..6db37a46d305 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -1267,5 +1267,5 @@ gcc internals and indent, all available from https://www.gnu.org/manual/ WG14 is the international standardization working group for the programming language C, URL: http://www.open-std.org/JTC1/SC22/WG14/ -Kernel :ref:`process/coding-style.rst <codingstyle>`, by greg@kroah.com at OLS 2002: +Kernel CodingStyle, by greg@kroah.com at OLS 2002: http://www.kroah.com/linux/talks/ols_2002_kernel_codingstyle_talk/html/ diff --git a/Documentation/process/contribution-maturity-model.rst b/Documentation/process/contribution-maturity-model.rst new file mode 100644 index 000000000000..b87ab34de22c --- /dev/null +++ b/Documentation/process/contribution-maturity-model.rst @@ -0,0 +1,109 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================== +Linux Kernel Contribution Maturity Model +======================================== + + +Background +========== + +As a part of the 2021 Linux Kernel Maintainers’ Summit, there was a +`discussion <https://lwn.net/Articles/870581/>`_ about the challenges in +recruiting kernel maintainers as well as maintainer succession. Some of +the conclusions from that discussion included that companies which are a +part of the Linux Kernel community need to allow engineers to be +maintainers as part of their job, so they can grow into becoming +respected leaders and eventually, kernel maintainers. To support a +strong talent pipeline, developers should be allowed and encouraged to +take on upstream contributions such as reviewing other people’s patches, +refactoring kernel infrastructure, and writing documentation. + +To that end, the Linux Foundation Technical Advisory Board (TAB) +proposes this Linux Kernel Contribution Maturity Model. These common +expectations for upstream community engagement aim to increase the +influence of individual developers, increase the collaboration of +organizations, and improve the overall health of the Linux Kernel +ecosystem. + +The TAB urges organizations to continuously evaluate their Open Source +maturity model and commit to improvements to align with this model. To +be effective, this evaluation should incorporate feedback from across +the organization, including management and developers at all seniority +levels. In the spirit of Open Source, we encourage organizations to +publish their evaluations and plans to improve their engagement with the +upstream community. + +Level 0 +======= + +* Software Engineers are not allowed to contribute patches to the Linux + kernel. + + +Level 1 +======= + +* Software Engineers are allowed to contribute patches to the Linux + kernel, either as part of their job responsibilities or on their own + time. + +Level 2 +======= + +* Software Engineers are expected to contribute to the Linux Kernel as + part of their job responsibilities. +* Software Engineers will be supported to attend Linux-related + conferences as a part of their job. +* A Software Engineer’s upstream code contributions will be considered + in promotion and performance reviews. + +Level 3 +======= + +* Software Engineers are expected to review patches (including patches + authored by engineers from other companies) as part of their job + responsibilities +* Contributing presentations or papers to Linux-related or academic + conferences (such those organized by the Linux Foundation, Usenix, + ACM, etc.), are considered part of an engineer’s work. +* A Software Engineer’s community contributions will be considered in + promotion and performance reviews. +* Organizations will regularly report metrics of their open source + contributions and track these metrics over time. These metrics may be + published only internally within the organization, or at the + organization’s discretion, some or all may be published externally. + Metrics that are strongly suggested include: + + * The number of upstream kernel contributions by team or organization + (e.g., all people reporting up to a manager, director, or VP). + * The percentage of kernel developers who have made upstream + contributions relative to the total kernel developers in the + organization. + * The time interval between kernels used in the organization’s servers + and/or products, and the publication date of the upstream kernel + upon which the internal kernel is based. + * The number of out-of-tree commits present in internal kernels. + +Level 4 +======= + +* Software Engineers are encouraged to spend a portion of their work + time focused on Upstream Work, which is defined as reviewing patches, + serving on program committees, improving core project infrastructure + such as writing or maintaining tests, upstream tech debt reduction, + writing documentation, etc. +* Software Engineers are supported in helping to organize Linux-related + conferences. +* Organizations will consider community member feedback in official + performance reviews. + +Level 5 +======= + +* Upstream kernel development is considered a formal job position, with + at least a third of the engineer’s time spent doing Upstream Work. +* Organizations will actively seek out community member feedback as a + factor in official performance reviews. +* Organizations will regularly report internally on the ratio of + Upstream Work to work focused on directly pursuing business goals. diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index 565df595152e..b501cd977053 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -57,6 +57,7 @@ Other guides to the community that are of interest to most developers are: deprecated maintainers researcher-guidelines + contribution-maturity-model These are some overall technical guides that have been put here for now for lack of a better place. diff --git a/Documentation/process/kernel-docs.rst b/Documentation/process/kernel-docs.rst index 1c6e2ab92f4e..46f927aae6eb 100644 --- a/Documentation/process/kernel-docs.rst +++ b/Documentation/process/kernel-docs.rst @@ -75,13 +75,39 @@ On-line docs Published books --------------- + * Title: **Linux Kernel Debugging: Leverage proven tools and advanced techniques to effectively debug Linux kernels and kernel modules** + + :Author: Kaiwan N Billimoria + :Publisher: Packt Publishing Ltd + :Date: August, 2022 + :Pages: 638 + :ISBN: 978-1801075039 + :Notes: Debugging book + * Title: **Linux Kernel Programming: A Comprehensive Guide to Kernel Internals, Writing Kernel Modules, and Kernel Synchronization** - :Author: Kaiwan N. Billimoria - :Publisher: Packt Publishing Ltd - :Date: 2021 - :Pages: 754 - :ISBN: 978-1789953435 + :Author: Kaiwan N Billimoria + :Publisher: Packt Publishing Ltd + :Date: March, 2021 + :Pages: 754 + :ISBN: 978-1789953435 + + * Title: **Linux Kernel Programming Part 2 - Char Device Drivers and Kernel Synchronization: Create user-kernel interfaces, work with peripheral I/O, and handle hardware interrupts** + + :Author: Kaiwan N Billimoria + :Publisher: Packt Publishing Ltd + :Date: March, 2021 + :Pages: 452 + :ISBN: 978-1801079518 + + * Title: **Linux System Programming: Talking Directly to the Kernel and C Library** + + :Author: Robert Love + :Publisher: O'Reilly Media + :Date: June, 2013 + :Pages: 456 + :ISBN: 978-1449339531 + :Notes: Foundational book * Title: **Linux Kernel Development, 3rd Edition** diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst index 572a3289c9cb..178c95fd17dc 100644 --- a/Documentation/process/maintainer-tip.rst +++ b/Documentation/process/maintainer-tip.rst @@ -128,8 +128,8 @@ uppercase letter and should be written in imperative tone. Changelog ^^^^^^^^^ -The general rules about changelogs in the process documentation, see -:ref:`Documentation/process/ <submittingpatches>`, apply. +The general rules about changelogs in the :ref:`Submitting patches guide +<describe_changes>`, apply. The tip tree maintainers set value on following these rules, especially on the request to write changelogs in imperative mood and not impersonating diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index 828997bc9ff9..7a5619fecb38 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -223,20 +223,17 @@ patch. Select the recipients for your patch ------------------------------------ -You should always copy the appropriate subsystem maintainer(s) on any patch -to code that they maintain; look through the MAINTAINERS file and the -source code revision history to see who those maintainers are. The -script scripts/get_maintainer.pl can be very useful at this step (pass paths to -your patches as arguments to scripts/get_maintainer.pl). If you cannot find a +You should always copy the appropriate subsystem maintainer(s) and list(s) on +any patch to code that they maintain; look through the MAINTAINERS file and the +source code revision history to see who those maintainers are. The script +scripts/get_maintainer.pl can be very useful at this step (pass paths to your +patches as arguments to scripts/get_maintainer.pl). If you cannot find a maintainer for the subsystem you are working on, Andrew Morton (akpm@linux-foundation.org) serves as a maintainer of last resort. -You should also normally choose at least one mailing list to receive a copy -of your patch set. linux-kernel@vger.kernel.org should be used by default -for all patches, but the volume on that list has caused a number of -developers to tune it out. Look in the MAINTAINERS file for a -subsystem-specific list; your patch will probably get more attention there. -Please do not spam unrelated lists, though. +linux-kernel@vger.kernel.org should be used by default for all patches, but the +volume on that list has caused a number of developers to tune it out. Please +do not spam unrelated lists and unrelated people, though. Many kernel-related lists are hosted on vger.kernel.org; you can find a list of them at http://vger.kernel.org/vger-lists.html. There are diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst index 3be44e74ec5d..5462c84f4723 100644 --- a/Documentation/riscv/vm-layout.rst +++ b/Documentation/riscv/vm-layout.rst @@ -47,7 +47,7 @@ RISC-V Linux Kernel SV39 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffffffc6fee00000 | -228 GB | ffffffc6feffffff | 2 MB | fixmap + ffffffc6fea00000 | -228 GB | ffffffc6feffffff | 6 MB | fixmap ffffffc6ff000000 | -228 GB | ffffffc6ffffffff | 16 MB | PCI io ffffffc700000000 | -228 GB | ffffffc7ffffffff | 4 GB | vmemmap ffffffc800000000 | -224 GB | ffffffd7ffffffff | 64 GB | vmalloc/ioremap space @@ -83,7 +83,7 @@ RISC-V Linux Kernel SV48 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffff8d7ffee00000 | -114.5 TB | ffff8d7ffeffffff | 2 MB | fixmap + ffff8d7ffea00000 | -114.5 TB | ffff8d7ffeffffff | 6 MB | fixmap ffff8d7fff000000 | -114.5 TB | ffff8d7fffffffff | 16 MB | PCI io ffff8d8000000000 | -114.5 TB | ffff8f7fffffffff | 2 TB | vmemmap ffff8f8000000000 | -112.5 TB | ffffaf7fffffffff | 32 TB | vmalloc/ioremap space @@ -119,7 +119,7 @@ RISC-V Linux Kernel SV57 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ff1bfffffee00000 | -57 PB | ff1bfffffeffffff | 2 MB | fixmap + ff1bfffffea00000 | -57 PB | ff1bfffffeffffff | 6 MB | fixmap ff1bffffff000000 | -57 PB | ff1bffffffffffff | 16 MB | PCI io ff1c000000000000 | -57 PB | ff1fffffffffffff | 1 PB | vmemmap ff20000000000000 | -56 PB | ff5fffffffffffff | 16 PB | vmalloc/ioremap space diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst index ed7f4f5b3cf1..b91e9ef4d0c2 100644 --- a/Documentation/rust/arch-support.rst +++ b/Documentation/rust/arch-support.rst @@ -15,7 +15,7 @@ support corresponds to ``S`` values in the ``MAINTAINERS`` file. ============ ================ ============================================== Architecture Level of support Constraints ============ ================ ============================================== -``x86`` Maintained ``x86_64`` only. ``um`` Maintained ``x86_64`` only. +``x86`` Maintained ``x86_64`` only. ============ ================ ============================================== diff --git a/Documentation/sound/hd-audio/models.rst b/Documentation/sound/hd-audio/models.rst index 9b52f50a6854..120430450014 100644 --- a/Documentation/sound/hd-audio/models.rst +++ b/Documentation/sound/hd-audio/models.rst @@ -704,7 +704,7 @@ ref no-jd BIOS setup but without jack-detection intel - Intel DG45* mobos + Intel D*45* mobos dell-m6-amic Dell desktops/laptops with analog mics dell-m6-dmic diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index b927fb2b94dc..e8bca5fea7cc 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -3510,7 +3510,7 @@ directories, the rmdir will fail with EBUSY. Stack trace ----------- Since the kernel has a fixed sized stack, it is important not to -waste it in functions. A kernel developer must be conscience of +waste it in functions. A kernel developer must be conscious of what they allocate on the stack. If they add too much, the system can be in danger of a stack overflow, and corruption will occur, usually leading to a system panic. diff --git a/Documentation/translations/it_IT/core-api/symbol-namespaces.rst b/Documentation/translations/it_IT/core-api/symbol-namespaces.rst index 0f6898860d6d..17abc25ee4c1 100644 --- a/Documentation/translations/it_IT/core-api/symbol-namespaces.rst +++ b/Documentation/translations/it_IT/core-api/symbol-namespaces.rst @@ -1,7 +1,6 @@ .. include:: ../disclaimer-ita.rst -:Original: :doc:`../../../core-api/symbol-namespaces` -:Translator: Federico Vaga <federico.vaga@vaga.pv.it> +:Original: Documentation/core-api/symbol-namespaces.rst =========================== Spazio dei nomi dei simboli diff --git a/Documentation/translations/it_IT/doc-guide/parse-headers.rst b/Documentation/translations/it_IT/doc-guide/parse-headers.rst index 993d549ee2b8..c7076a21667a 100644 --- a/Documentation/translations/it_IT/doc-guide/parse-headers.rst +++ b/Documentation/translations/it_IT/doc-guide/parse-headers.rst @@ -1,7 +1,6 @@ .. include:: ../disclaimer-ita.rst -.. note:: Per leggere la documentazione originale in inglese: - :ref:`Documentation/doc-guide/index.rst <doc_guide>` +:Original: Documentation/doc-guide/index.rst ========================================= Includere gli i file di intestazione uAPI @@ -190,7 +189,7 @@ COPYRIGHT Copyright (c) 2016 by Mauro Carvalho Chehab <mchehab@s-opensource.com>. -Licenza GPLv2: GNU GPL version 2 <http://gnu.org/licenses/gpl.html>. +Licenza GPLv2: GNU GPL version 2 <https://gnu.org/licenses/gpl.html>. Questo è software libero: siete liberi di cambiarlo e ridistribuirlo. Non c'è alcuna garanzia, nei limiti permessi dalla legge. diff --git a/Documentation/translations/it_IT/index.rst b/Documentation/translations/it_IT/index.rst index fc5f39814e83..b95dfa1ded04 100644 --- a/Documentation/translations/it_IT/index.rst +++ b/Documentation/translations/it_IT/index.rst @@ -2,9 +2,9 @@ .. _it_linux_doc: -=================== -Traduzione italiana -=================== +================================== +La documentazione del kernel Linux +================================== .. raw:: latex @@ -12,6 +12,18 @@ Traduzione italiana :manutentore: Federico Vaga <federico.vaga@vaga.pv.it> +Questo è il livello principale della documentazione del kernel in +lingua italiana. La traduzione è incompleta, noterete degli avvisi +che vi segnaleranno la mancanza di una traduzione o di un gruppo di +traduzioni. + +Più in generale, la documentazione, come il kernel stesso, sono in +costante sviluppo; particolarmente vero in quanto stiamo lavorando +alla riorganizzazione della documentazione in modo più coerente. +I miglioramenti alla documentazione sono sempre i benvenuti; per cui, +se vuoi aiutare, iscriviti alla lista di discussione linux-doc presso +vger.kernel.org. + .. _it_disclaimer: Avvertenze @@ -54,23 +66,8 @@ Se avete bisogno d'aiuto per comunicare con la comunità Linux ma non vi sentite a vostro agio nello scrivere in inglese, potete chiedere aiuto al manutentore della traduzione. -La documentazione del kernel Linux -================================== - -Questo è il livello principale della documentazione del kernel in -lingua italiana. La traduzione è incompleta, noterete degli avvisi -che vi segnaleranno la mancanza di una traduzione o di un gruppo di -traduzioni. - -Più in generale, la documentazione, come il kernel stesso, sono in -costante sviluppo; particolarmente vero in quanto stiamo lavorando -alla riorganizzazione della documentazione in modo più coerente. -I miglioramenti alla documentazione sono sempre i benvenuti; per cui, -se vuoi aiutare, iscriviti alla lista di discussione linux-doc presso -vger.kernel.org. - Lavorare con la comunità di sviluppo ------------------------------------- +==================================== Le guide fondamentali per l'interazione con la comunità di sviluppo del kernel e su come vedere il proprio lavoro integrato. @@ -85,7 +82,7 @@ su come vedere il proprio lavoro integrato. Manuali sull'API interna ------------------------- +======================== Di seguito una serie di manuali per gli sviluppatori che hanno bisogno di interfacciarsi con il resto del kernel. @@ -96,7 +93,7 @@ interfacciarsi con il resto del kernel. core-api/index Strumenti e processi per lo sviluppo ------------------------------------- +==================================== Di seguito una serie di manuali contenenti informazioni utili a tutti gli sviluppatori del kernel. @@ -109,7 +106,7 @@ sviluppatori del kernel. kernel-hacking/index Documentazione per gli utenti ------------------------------ +============================= Di seguito una serie di manuali per gli *utenti* del kernel - ovvero coloro che stanno cercando di farlo funzionare al meglio per un dato sistema, ma anche @@ -120,16 +117,16 @@ Consultate anche `Linux man pages <https://www.kernel.org/doc/man-pages/>`_, che vengono mantenuti separatamente dalla documentazione del kernel Linux Documentazione relativa ai firmware ------------------------------------ +=================================== Di seguito informazioni sulle aspettative del kernel circa i firmware. Documentazione specifica per architettura ------------------------------------------ +========================================= Documentazione varia --------------------- +==================== Ci sono documenti che sono difficili da inserire nell'attuale organizzazione della documentazione; altri hanno bisogno di essere migliorati e/o convertiti diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst index 05d362b16bf0..a9e781d2e323 100644 --- a/Documentation/translations/it_IT/kernel-hacking/locking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst @@ -1029,6 +1029,11 @@ Dato che questo è un problema abbastanza comune con una propensione alle corse critiche, dovreste usare timer_delete_sync() (``include/linux/timer.h``) per gestire questo caso. +Prima di rilasciare un temporizzatore dovreste chiamare la funzione +timer_shutdown() o timer_shutdown_sync() di modo che non venga più ricarmato. +Ogni successivo tentativo di riarmare il temporizzatore verrà silenziosamente +ignorato. + Velocità della sincronizzazione =============================== diff --git a/Documentation/translations/it_IT/process/5.Posting.rst b/Documentation/translations/it_IT/process/5.Posting.rst index cf92a16ed7e5..a7e2a3238415 100644 --- a/Documentation/translations/it_IT/process/5.Posting.rst +++ b/Documentation/translations/it_IT/process/5.Posting.rst @@ -265,15 +265,18 @@ Le etichette in uso più comuni sono: :ref:`Documentation/translations/it_IT/process/submitting-patches.rst <it_submittingpatches>` - Reported-by: menziona l'utente che ha riportato il problema corretto da - questa patch; quest'etichetta viene usata per dare credito alle persone - che hanno verificato il codice e ci hanno fatto sapere quando le cose non - funzionavano correttamente. + questa patch; quest'etichetta viene usata per dare credito alle persone che + hanno verificato il codice e ci hanno fatto sapere quando le cose non + funzionavano correttamente. Se esiste un rapporto disponibile sul web, allora + L'etichetta dovrebbe essere seguita da un collegamento al suddetto rapporto. - Cc: la persona menzionata ha ricevuto una copia della patch ed ha avuto l'opportunità di commentarla. -State attenti ad aggiungere queste etichette alla vostra patch: solo -"Cc:" può essere aggiunta senza il permesso esplicito della persona menzionata. +State attenti ad aggiungere queste etichette alla vostra patch: solo "Cc:" può +essere aggiunta senza il permesso esplicito della persona menzionata. Il più +delle volte anche Reported-by: va bene, ma è sempre meglio chiedere specialmente +se il baco è stato riportato in una comunicazione privata. Inviare la modifica ------------------- diff --git a/Documentation/translations/it_IT/process/changes.rst b/Documentation/translations/it_IT/process/changes.rst index 473ec2cc558e..f37c53f8b524 100644 --- a/Documentation/translations/it_IT/process/changes.rst +++ b/Documentation/translations/it_IT/process/changes.rst @@ -36,7 +36,7 @@ GNU C 5.1 gcc --version Clang/LLVM (optional) 11.0.0 clang --version GNU make 3.81 make --version bash 4.2 bash --version -binutils 2.23 ld -v +binutils 2.25 ld -v flex 2.5.35 flex --version bison 2.0 bison --version pahole 1.16 pahole --version @@ -97,7 +97,7 @@ Questo richiede bash 4.2 o successivo. Binutils -------- -Per generare il kernel è necessario avere Binutils 2.23 o superiore. +Per generare il kernel è necessario avere Binutils 2.25 o superiore. pkg-config ---------- diff --git a/Documentation/translations/it_IT/process/clang-format.rst b/Documentation/translations/it_IT/process/clang-format.rst index 77eac809a639..29f83c198025 100644 --- a/Documentation/translations/it_IT/process/clang-format.rst +++ b/Documentation/translations/it_IT/process/clang-format.rst @@ -40,7 +40,7 @@ Linux più popolari. Cercate ``clang-format`` nel vostro repositorio. Altrimenti, potete scaricare una versione pre-generata dei binari di LLVM/clang oppure generarlo dai codici sorgenti: - http://releases.llvm.org/download.html + https://releases.llvm.org/download.html Troverete più informazioni ai seguenti indirizzi: diff --git a/Documentation/translations/it_IT/process/coding-style.rst b/Documentation/translations/it_IT/process/coding-style.rst index a393ee4182af..5f244e16f511 100644 --- a/Documentation/translations/it_IT/process/coding-style.rst +++ b/Documentation/translations/it_IT/process/coding-style.rst @@ -1204,10 +1204,10 @@ ISBN 0-201-61586-X. Manuali GNU - nei casi in cui sono compatibili con K&R e questo documento - per indent, cpp, gcc e i suoi dettagli interni, tutto disponibile qui -http://www.gnu.org/manual/ +https://www.gnu.org/manual/ WG14 è il gruppo internazionale di standardizzazione per il linguaggio C, -URL: http://www.open-std.org/JTC1/SC22/WG14/ +URL: https://www.open-std.org/JTC1/SC22/WG14/ -Kernel process/coding-style.rst, by greg@kroah.com at OLS 2002: +Kernel CodingStyle, by greg@kroah.com at OLS 2002: http://www.kroah.com/linux/talks/ols_2002_kernel_codingstyle_talk/html/ diff --git a/Documentation/translations/it_IT/process/deprecated.rst b/Documentation/translations/it_IT/process/deprecated.rst index febf83897783..57b501f0dfa4 100644 --- a/Documentation/translations/it_IT/process/deprecated.rst +++ b/Documentation/translations/it_IT/process/deprecated.rst @@ -332,7 +332,7 @@ zero come risultato:: Il valore di ``size`` nell'ultima riga sarà ``zero``, quando uno invece si aspetterebbe che il suo valore sia la dimensione totale in -byte dell'allocazione dynamica che abbiamo appena fatto per l'array +byte dell'allocazione dinamica che abbiamo appena fatto per l'array ``items``. Qui un paio di esempi reali del problema: `collegamento 1 <https://git.kernel.org/linus/f2cd32a443da694ac4e28fbf4ac6f9d5cc63a539>`_, `collegamento 2 @@ -381,4 +381,29 @@ combinazione con struct_size() e flex_array_size():: instance = kmalloc(struct_size(instance, items, count), GFP_KERNEL); instance->count = count; - memcpy(instance->items, source, flex_array_size(instance, items, instance->count)); + memcpy(instance->items, source, flex_array_size(instance, items, instance->count)); + +Ci sono due casi speciali dove è necessario usare la macro DECLARE_FLEX_ARRAY() +(da notare che la stessa macro è chiamata __DECLARE_FLEX_ARRAY() nei file di +intestazione UAPI). Uno è quando l'array flessibile è l'unico elemento di una +struttura, e l'altro è quando è parti un unione. Per motivi non tecnici, entrambi +i casi d'uso non sono permessi dalla specifica C99. Per esempio, per +convertire il seguente codice:: + + struct something { + ... + union { + struct type1 one[0]; + struct type2 two[0]; + }; + }; + +La macro di supporto dev'essere usata:: + + struct something { + ... + union { + DECLARE_FLEX_ARRAY(struct type1, one); + DECLARE_FLEX_ARRAY(struct type2, two); + }; + }; diff --git a/Documentation/translations/it_IT/process/email-clients.rst b/Documentation/translations/it_IT/process/email-clients.rst index 970671cd91af..76ca3226c8cd 100644 --- a/Documentation/translations/it_IT/process/email-clients.rst +++ b/Documentation/translations/it_IT/process/email-clients.rst @@ -364,3 +364,28 @@ un editor esterno. Un altro problema è che Gmail usa la codifica base64 per tutti quei messaggi che contengono caratteri non ASCII. Questo include cose tipo i nomi europei. + +Proton Mail +*********** + +Il servizio Proton Mail ha una funzionalità che cripta tutti i messaggi verso +ogni destinatario per cui è possibile trovare una chiave usando il *Web Key +Directory* (WKD). Il servizio kernel.org pubblica il WKD per ogni sviluppatore +in possesso di un conto kernel.org. Di conseguenza, tutti i messaggi inviati +usando Proton Mail verso indirizzi kernel.org verranno criptati. + +Proton Mail non fornisce alcun meccanismo per disabilitare questa funzionalità +perché verrebbe considerato un problema per la riservatezza. Questa funzionalità +è attiva anche quando si inviano messaggi usando il Proton Mail Bridge. Dunque +tutta la posta in uscita verrà criptata, incluse le patch inviate con ``git +send-email``. + +I messaggi criptati sono una fonte di problemi; altri sviluppatori potrebbero +non aver configurato i loro programmi, o strumenti, per gestire messaggi +criptati; inoltre, alcuni programmi di posta elettronica potrebbero criptare le +risposte a messaggi criptati per tutti i partecipanti alla discussione, inclusa +la lista di discussione stessa. + +A meno che non venga introdotta una maniera per disabilitare questa +funzionalità , non è consigliato usare Proton Mail per contribuire allo sviluppo +del kernel. diff --git a/Documentation/translations/it_IT/process/index.rst b/Documentation/translations/it_IT/process/index.rst index 25602c1a97d1..cd7977905fb8 100644 --- a/Documentation/translations/it_IT/process/index.rst +++ b/Documentation/translations/it_IT/process/index.rst @@ -10,6 +10,7 @@ .. _it_process_index: +=============================================== Lavorare con la comunità di sviluppo del kernel =============================================== diff --git a/Documentation/translations/it_IT/process/maintainer-pgp-guide.rst b/Documentation/translations/it_IT/process/maintainer-pgp-guide.rst index 5526bcabeb0a..cdc43c4a9b0b 100644 --- a/Documentation/translations/it_IT/process/maintainer-pgp-guide.rst +++ b/Documentation/translations/it_IT/process/maintainer-pgp-guide.rst @@ -68,42 +68,24 @@ stesso. Strumenti PGP ============= -Usare GnuPG v2 --------------- +Usare GnuPG 2.2 o successivo +---------------------------- La vostra distribuzione potrebbe avere già installato GnuPG, dovete solo -verificare che stia utilizzando la versione 2.x e non la serie 1.4 -- -molte distribuzioni forniscono entrambe, di base il comando ''gpg'' -invoca GnuPG v.1. Per controllate usate:: +verificare che stia utilizzando la versione abbastanza recente. Per controllate +usate:: $ gpg --version | head -n1 -Se visualizzate ``gpg (GnuPG) 1.4.x``, allora state usando GnuPG v.1. -Provate il comando ``gpg2`` (se non lo avete, potreste aver bisogno -di installare il pacchetto gnupg2):: - - $ gpg2 --version | head -n1 - -Se visualizzate ``gpg (GnuPG) 2.x.x``, allora siete pronti a partire. -Questa guida assume che abbiate la versione 2.2.(o successiva) di GnuPG. -Se state usando la versione 2.0, alcuni dei comandi indicati qui non -funzioneranno, in questo caso considerate un aggiornamento all'ultima versione, -la 2.2. Versioni di gnupg-2.1.11 e successive dovrebbero essere compatibili -per gli obiettivi di questa guida. - -Se avete entrambi i comandi: ``gpg`` e ``gpg2``, assicuratevi di utilizzare -sempre la versione V2, e non quella vecchia. Per evitare errori potreste creare -un alias:: - - $ alias gpg=gpg2 - -Potete mettere questa opzione nel vostro ``.bashrc`` in modo da essere sicuri. +Se state utilizzando la version 2.2 o successiva, allora siete pronti a partire. +Se invece state usando una versione precedente, allora alcuni comandi elencati +in questa guida potrebbero non funzionare. Configurare le opzioni di gpg-agent ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ L'agente GnuPG è uno strumento di aiuto che partirà automaticamente ogni volta -che userete il comando ``gpg`` e funzionerà in background con l'obiettivo di +che userete il comando ``gpg`` e funzionerà in *background* con l'obiettivo di individuare la passphrase. Ci sono due opzioni che dovreste conoscere per personalizzare la scadenza della passphrase nella cache: @@ -131,19 +113,7 @@ valori:: riguarda vecchie le versioni di GnuPG, poiché potrebbero non svolgere più bene il loro compito. -Impostare un *refresh* con cronjob -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Potreste aver bisogno di rinfrescare regolarmente il vostro portachiavi in -modo aggiornare le chiavi pubbliche di altre persone, lavoro che è svolto -al meglio con un cronjob giornaliero:: - - @daily /usr/bin/gpg2 --refresh >/dev/null 2>&1 - -Controllate il percorso assoluto del vostro comando ``gpg`` o ``gpg2`` e usate -il comando ``gpg2`` se per voi ``gpg`` corrisponde alla versione GnuPG v.1. - -.. _it_master_key: +.. _it_protect_your_key: Proteggere la vostra chiave PGP primaria ======================================== @@ -155,55 +125,62 @@ al documento "`Protecting Code Integrity`_" che abbiamo menzionato prima. Dovreste inoltre creare una nuova chiave se quella attuale è inferiore a 2048 bit (RSA). -Chiave principale o sottochiavi -------------------------------- - -Le sottochiavi sono chiavi PGP totalmente indipendenti, e sono collegate alla -chiave principale attraverso firme certificate. È quindi importante -comprendere i seguenti punti: - -1. Non ci sono differenze tecniche tra la chiave principale e la sottochiave. -2. In fase di creazione, assegniamo limitazioni funzionali ad ogni chiave - assegnando capacità specifiche. -3. Una chiave PGP può avere 4 capacità : +Le sottochiavi PGP +------------------ - - **[S]** può essere usata per firmare - - **[E]** può essere usata per criptare - - **[A]** può essere usata per autenticare - - **[C]** può essere usata per certificare altre chiavi +Raramente le chiavi PGP sono composte da una singola coppia -- solitamente, sono +una collezione di sottochiavi indipendenti usate per diversi scopi in funzione +delle capacità assegnate al momento della creazione. Una chiave PGP può avere +quattro capacità : + +- **[S]** può essere usata per firmare +- **[E]** può essere usata per criptare +- **[A]** può essere usata per autenticare +- **[C]** può essere usata per certificare altre chiavi + +La chiave con la capacità **[C]** viene spesso chiamata chiave "passepartout" +(*master key*), ma è una terminologia fuorviante perché lascia intendere che la +chiave di certificato possa essere usate in sostituzione delle altre (proprio +come le vere chiavi passpartout in grado di aprire diverse serrature). Dato che +questo non è il caso, per evitare fraintendimenti, in questa guida ci riferiremo +a questa chiave chiamandola "La chiave di certificazione". + +I seguenti punti sono molto importanti: + +1. Tutte le sottochiavi sono indipendenti. Se perdete una sottochiave privata + non potrete recuperarla usando le altre. +2. Ad eccezione della chiave di certificazione, ci possono essere più + sottochiavi con le stesse capacità (per esempio, potete avere 2 sottochiavi + per criptare, 3 per firmare, ma solo una per una sola per certificare). Tutte + le sottochiavi sono indipendenti -- un messaggio criptato usando una chiave + **[E]** non può essere decriptato usano altre sottochiavi **[E]**. +3. Una sottochiave può avere più capacità (per esempio, la chiave **[C]** può + anche essere una chiave **[S]**). + +La chiave con capacità **[C]** (certificazione) è la sola che può essere usata +per indicare relazioni fra chiavi. Solo la chiave **[C]** può essere usata per: + +- aggiungere o revocare altre chiavi (sottochiavi) che hanno capacità S/E/A; +- aggiungere, modificare o eliminare le identità (unids) associate alla chiave; +- aggiungere o modificare la propria data di scadenza o delle sottochiavi; +- firmare le chiavi di altre persone a scopo di creare una rete di fiducia. -4. Una singola chiave può avere più capacità -5. Una sottochiave è completamente indipendente dalla chiave principale. - Un messaggio criptato con la sottochiave non può essere decrittato con - quella principale. Se perdete la vostra sottochiave privata, non può - essere rigenerata in nessun modo da quella principale. +Di base, alla creazione di nuove chiavi, GnuPG genera quanto segue: -La chiave con capacità **[C]** (certify) è identificata come la chiave -principale perché è l'unica che può essere usata per indicare la relazione -con altre chiavi. Solo la chiave **[C]** può essere usata per: +- Una chiave la capacità di certificazione che quella di firma (**[SC]**) +- Una sottochiave separata con capacità di criptare (**[E]**) -- Aggiungere o revocare altre chiavi (sottochiavi) che hanno capacità S/E/A -- Aggiungere, modificare o eliminare le identità (unids) associate alla chiave -- Aggiungere o modificare la data di termine di sé stessa o di ogni sottochiave -- Firmare le chiavi di altre persone a scopo di creare una rete di fiducia -Di base, alla creazione di nuove chiavi, GnuPG genera quanto segue: -- Una chiave madre che porta sia la capacità di certificazione che quella - di firma (**[SC]**) -- Una sottochiave separata con capacità di criptaggio (**[E]**) -Se avete usato i parametri di base per generare la vostra chiave, quello +Se avete usato i parametri predefiniti per generare la vostra chiave, quello sarà il risultato. Potete verificarlo utilizzando ``gpg --list-secret-keys``, per esempio:: - sec rsa2048 2018-01-23 [SC] [expires: 2020-01-23] + sec ed25519 2022-12-20 [SC] [expires: 2024-12-19] 000000000000000000000000AAAABBBBCCCCDDDD uid [ultimate] Alice Dev <adev@kernel.org> - ssb rsa2048 2018-01-23 [E] [expires: 2020-01-23] - -Qualsiasi chiave che abbia la capacità **[C]** è la vostra chiave madre, -indipendentemente da quali altre capacità potreste averle assegnato. + ssb cv25519 2022-12-20 [E] [expires: 2024-12-19] La lunga riga sotto la voce ``sec`` è la vostra impronta digitale -- negli esempi che seguono, quando vedere ``[fpr]`` ci si riferisce a questa @@ -238,20 +215,10 @@ possano ricevere la vostra nuova sottochiave:: $ gpg --send-key [fpr] .. note:: Supporto ECC in GnuPG - GnuPG 2.1 e successivi supportano pienamente *Elliptic Curve Cryptography*, - con la possibilità di combinare sottochiavi ECC con le tradizionali chiavi - primarie RSA. Il principale vantaggio della crittografia ECC è che è molto - più veloce da calcolare e crea firme più piccole se confrontate byte per - byte con le chiavi RSA a più di 2048 bit. A meno che non pensiate di - utilizzare un dispositivo smartcard che non supporta le operazioni ECC, vi - raccomandiamo ti creare sottochiavi di firma ECC per il vostro lavoro col - kernel. - - Se per qualche ragione preferite rimanere con sottochiavi RSA, nel comando - precedente, sostituite "ed25519" con "rsa2048". In aggiunta, se avete - intenzione di usare un dispositivo hardware che non supporta le chiavi - ED25519 ECC, come la Nitrokey Pro o la Yubikey, allora dovreste usare - "nistp256" al posto di "ed25519". + + Tenete presente che se avete intenzione di usare un dispositivo che non + supporta chiavi ED25519 ECC, allora dovreste usare "nistp256" al posto di + "ed25519". Più avanti ci sono alcune raccomandazioni per i dispositivi. Copia di riserva della chiave primaria per gestire il recupero da disastro -------------------------------------------------------------------------- @@ -360,13 +327,13 @@ Per prima cosa, identificate il keygrip della vostra chiave primaria:: L'output assomiglierà a questo:: - pub rsa2048 2018-01-24 [SC] [expires: 2020-01-24] + pub ed25519 2022-12-20 [SC] [expires: 2022-12-19] 000000000000000000000000AAAABBBBCCCCDDDD Keygrip = 1111000000000000000000000000000000000000 uid [ultimate] Alice Dev <adev@kernel.org> - sub rsa2048 2018-01-24 [E] [expires: 2020-01-24] + sub cv25519 2022-12-20 [E] [expires: 2022-12-19] Keygrip = 2222000000000000000000000000000000000000 - sub ed25519 2018-01-24 [S] + sub ed25519 2022-12-20 [S] Keygrip = 3333000000000000000000000000000000000000 Trovate la voce keygrid che si trova sotto alla riga ``pub`` (appena sotto @@ -389,11 +356,11 @@ Ora, se eseguite il comando ``--list-secret-keys``, vedrete che la chiave primaria non compare più (il simbolo ``#`` indica che non è disponibile):: $ gpg --list-secret-keys - sec# rsa2048 2018-01-24 [SC] [expires: 2020-01-24] + sec# ed25519 2022-12-20 [SC] [expires: 2024-12-19] 000000000000000000000000AAAABBBBCCCCDDDD uid [ultimate] Alice Dev <adev@kernel.org> - ssb rsa2048 2018-01-24 [E] [expires: 2020-01-24] - ssb ed25519 2018-01-24 [S] + ssb cv25519 2022-12-20 [E] [expires: 2024-12-19] + ssb ed25519 2022-12-20 [S] Dovreste rimuovere anche i file ``secring.gpg`` che si trovano nella cartella ``~/.gnupg``, in quanto rimasugli delle versioni precedenti di GnuPG. @@ -461,18 +428,20 @@ soluzioni disponibili: computer portatili più recenti. In aggiunta, offre altre funzionalità di sicurezza come FIDO, U2F, e ora supporta anche le chiavi ECC (NISTP) -`Su LWN c'è una buona recensione`_ dei modelli elencati qui sopra e altri. -La scelta dipenderà dal costo, dalla disponibilità nella vostra area -geografica e vostre considerazioni sull'hardware aperto/proprietario. +La vostra scelta dipenderà dal costo, la disponibilità nella vostra regione, e +sulla scelta fra dispositivi aperti e proprietari. -Se volete usare chiavi ECC, la vostra migliore scelta sul mercato è la -Nitrokey Start. +.. note:: + + Se siete nella lista MAINTAINERS o avete un profilo su kernel.org, allora + `potrete avere gratuitamente una Nitrokey Start`_ grazie alla fondazione + Linux. .. _`Nitrokey Start`: https://shop.nitrokey.com/shop/product/nitrokey-start-6 .. _`Nitrokey Pro 2`: https://shop.nitrokey.com/shop/product/nitrokey-pro-2-3 .. _`Yubikey 5`: https://www.yubico.com/product/yubikey-5-overview/ -.. _Gnuk: http://www.fsij.org/doc-gnuk/ -.. _`Su LWN c'è una buona recensione`: https://lwn.net/Articles/736231/ +.. _Gnuk: https://www.fsij.org/doc-gnuk/ +.. _`potrete avere gratuitamente una Nitrokey Start`: https://www.kernel.org/nitrokey-digital-tokens-for-kernel-developers.html Configurare il vostro dispositivo smartcard ------------------------------------------- @@ -513,6 +482,12 @@ altre informazioni sulla carta che potrebbero trapelare in caso di smarrimento. A dispetto del nome "PIN", né il PIN utente né quello dell'amministratore devono essere esclusivamente numerici. +.. warning:: + + Alcuni dispositivi richiedono la presenza delle sottochiavi nel dispositivo + stesso prima che possiate cambiare la passphare. Verificate la + documentazione del produttore. + Spostare le sottochiavi sulla smartcard --------------------------------------- @@ -525,11 +500,11 @@ dell'amministratore:: Secret subkeys are available. - pub rsa2048/AAAABBBBCCCCDDDD - created: 2018-01-23 expires: 2020-01-23 usage: SC + pub ed25519/AAAABBBBCCCCDDDD + created: 2022-12-20 expires: 2024-12-19 usage: SC trust: ultimate validity: ultimate - ssb rsa2048/1111222233334444 - created: 2018-01-23 expires: never usage: E + ssb cv25519/1111222233334444 + created: 2022-12-20 expires: never usage: E ssb ed25519/5555666677778888 created: 2017-12-07 expires: never usage: S [ultimate] (1). Alice Dev <adev@kernel.org> @@ -594,11 +569,11 @@ Ora, se doveste usare l'opzione ``--list-secret-keys``, vedrete una sottile differenza nell'output:: $ gpg --list-secret-keys - sec# rsa2048 2018-01-24 [SC] [expires: 2020-01-24] + sec# ed25519 2022-12-20 [SC] [expires: 2024-12-19] 000000000000000000000000AAAABBBBCCCCDDDD uid [ultimate] Alice Dev <adev@kernel.org> - ssb> rsa2048 2018-01-24 [E] [expires: 2020-01-24] - ssb> ed25519 2018-01-24 [S] + ssb> cv25519 2022-12-20 [E] [expires: 2024-12-19] + ssb> ed25519 2022-12-20 [S] Il simbolo ``>`` in ``ssb>`` indica che la sottochiave è disponibile solo nella smartcard. Se tornate nella vostra cartella delle chiavi segrete e @@ -661,7 +636,7 @@ eseguite:: Se per voi è più facile da memorizzare, potete anche utilizzare una data specifica (per esempio, il vostro compleanno o capodanno):: - $ gpg --quick-set-expire [fpr] 2020-07-01 + $ gpg --quick-set-expire [fpr] 2025-07-01 Ricordatevi di inviare l'aggiornamento ai keyserver:: @@ -676,6 +651,21 @@ dovreste importarle nella vostra cartella di lavoro abituale:: $ gpg --export | gpg --homedir ~/.gnupg --import $ unset GNUPGHOME +Usare gpg-agent con ssh +~~~~~~~~~~~~~~~~~~~~~~~ + +Se dovete firmare tag o commit su un sistema remoto, potete ridirezionare il +vostro gpg-agent attraverso ssh. Consultate le istruzioni disponibili nella wiki +GnuPG: + +- `Agent Forwarding over SSH`_ + +Funziona senza troppi intoppi se avete la possibilità di modificare le +impostazioni di sshd sul sistema remoto. + +.. _`Agent Forwarding over SSH`: https://wiki.gnupg.org/AgentForwarding + +.. _it_pgp_with_git: Usare PGP con Git ================= @@ -709,11 +699,6 @@ avere più chiavi segrete, potete dire a git quale dovrebbe usare (``[fpg]`` $ git config --global user.signingKey [fpr] -**IMPORTANTE**: se avete una comando dedicato per ``gpg2``, allora dovreste -dire a git di usare sempre quello piuttosto che il vecchio comando ``gpg``:: - - $ git config --global gpg.program gpg2 - Come firmare i tag ------------------ @@ -812,6 +797,61 @@ Potete dire a git di firmare sempre i commit:: .. _it_verify_identities: +Come lavorare con patch firmate +------------------------------- + +Esiste la possibilità di usare la vostra chiave PGP per firmare le patch che +invierete alla liste di discussione del kernel. I meccanismi esistenti per la +firma delle email (PGP-Mime o PGP-inline) tendono a causare problemi +nell'attività di revisione del codice. Si suggerisce, invece, di utilizare lo +strumento sviluppato da kernel.org che mette nell'intestazione del messaggio +un'attestazione delle firme crittografiche (tipo DKIM): + +- `Patatt Patch Attestation`_ + +.. _`Patatt Patch Attestation`: https://pypi.org/project/patatt/ + +Installare e configurate patatt +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Lo strumento patatt è disponibile per diverse distribuzioni, dunque cercatelo +prima lì. Oppure potete installarlo usano pypi "``pip install patatt``" + +Se avete già configurato git con la vostra chiave PGP (usando +``user.signingKey``), allora patatt non ha bisogno di alcuna configurazione +aggiuntiva. Potete iniziare a firmare le vostre patch aggiungendo un aggancio a +git-send-email nel vostro repositorio:: + + patatt install-hook + +Ora, qualsiasi patch che invierete con ``git send-email`` verrà automaticamente +firmata usando la vostra firma crittografica. + +Verificare le firme di patatt +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Se usate ``b4`` per verificare ed applicare le patch, allora tenterà +automaticamente di verificare tutte le firme DKIM e patatt disponibili. Per +esempio:: + + $ b4 am 20220720205013.890942-1-broonie@kernel.org + [...] + Checking attestation on all messages, may take a moment... + --- + ✓ [PATCH v1 1/3] kselftest/arm64: Correct buffer allocation for SVE Z registers + ✓ [PATCH v1 2/3] arm64/sve: Document our actual ABI for clearing registers on syscall + ✓ [PATCH v1 3/3] kselftest/arm64: Enforce actual ABI for SVE syscalls + --- + ✓ Signed: openpgp/broonie@kernel.org + ✓ Signed: DKIM/kernel.org + +.. note:: + + Lo sviluppo di patatt e b4 è piuttosto attivo. Si consiglia di verificare la + documentazione più recente. + +.. _it_kernel_identities: + Come verificare l'identità degli sviluppatori del kernel ======================================================== @@ -884,64 +924,18 @@ di base di GnuPG v2). Per farlo, aggiungete (o modificate) l'impostazione trust-model tofu+pgp -Come usare i keyserver in sicurezza ------------------------------------ -Se ottenete l'errore "No public key" quando cercate di validate il tag di -qualcuno, allora dovreste cercare quella chiave usando un keyserver. È -importante tenere bene a mente che non c'è alcuna garanzia che la chiave -che avete recuperato da un keyserver PGP appartenga davvero alla persona -reale -- è progettato così. Dovreste usare il Web of Trust per assicurarvi -che la chiave sia valida. - -Come mantenere il Web of Trust va oltre gli scopi di questo documento, -semplicemente perché farlo come si deve richiede sia sforzi che perseveranza -che tendono ad andare oltre al livello di interesse della maggior parte degli -esseri umani. Qui di seguito alcuni rapidi suggerimenti per aiutarvi a ridurre -il rischio di importare chiavi maligne. - -Primo, diciamo che avete provato ad eseguire ``git verify-tag`` ma restituisce -un errore dicendo che la chiave non è stata trovata:: - - $ git verify-tag sunxi-fixes-for-4.15-2 - gpg: Signature made Sun 07 Jan 2018 10:51:55 PM EST - gpg: using RSA key DA73759BF8619E484E5A3B47389A54219C0F2430 - gpg: issuer "wens@...org" - gpg: Can't check signature: No public key - -Cerchiamo nel keyserver per maggiori informazioni sull'impronta digitale -della chiave (l'impronta digitale, probabilmente, appartiene ad una -sottochiave, dunque non possiamo usarla direttamente senza trovare prima -l'ID della chiave primaria associata ad essa):: - - $ gpg --search DA73759BF8619E484E5A3B47389A54219C0F2430 - gpg: data source: hkp://keys.gnupg.net - (1) Chen-Yu Tsai <wens@...org> - 4096 bit RSA key C94035C21B4F2AEB, created: 2017-03-14, expires: 2019-03-15 - Keys 1-1 of 1 for "DA73759BF8619E484E5A3B47389A54219C0F2430". Enter number(s), N)ext, or Q)uit > q - -Localizzate l'ID della chiave primaria, nel nostro esempio -``C94035C21B4F2AEB``. Ora visualizzate le chiavi di Linus Torvalds -che avete nel vostro portachiavi:: - - $ gpg --list-key torvalds@kernel.org - pub rsa2048 2011-09-20 [SC] - ABAF11C65A2970B130ABE3C479BE3E4300411886 - uid [ unknown] Linus Torvalds <torvalds@kernel.org> - sub rsa2048 2011-09-20 [E] - -Poi, cercate un percorso affidabile da Linux Torvalds alla chiave che avete -trovato con ``gpg --search`` usando la chiave sconosciuta.Per farlo potete usare -diversi strumenti come https://github.com/mricon/wotmate, -https://git.kernel.org/pub/scm/docs/kernel/pgpkeys.git/tree/graphs, e -https://the.earth.li/~noodles/pathfind.html. - -Se trovate un paio di percorsi affidabili è un buon segno circa la validità -della chiave. Ora, potete aggiungerla al vostro portachiavi dal keyserver:: - - $ gpg --recv-key C94035C21B4F2AEB - -Questa procedura non è perfetta, e ovviamente state riponendo la vostra -fiducia nell'amministratore del servizio *PGP Pathfinder* sperando che non -sia malintenzionato (infatti, questo va contro :ref:`it_devs_not_infra`). -Tuttavia, se mantenete con cura la vostra rete di fiducia sarà un deciso -miglioramento rispetto alla cieca fiducia nei keyserver. +Usare il repositorio kernel.org per il web of trust +--------------------------------------------------- + +Il progetto kernel.org mantiene un repositorio git con le chiavi pubbliche degli sviluppatori in alternativa alla replica dei server di chiavi che negli ultimi anni sono spariti. La documentazione completa su come impostare il repositorio come vostra sorgente di chiavi pubbliche può essere trovato qui: + +- `Kernel developer PGP Keyring`_ + +Se siete uno sviluppatore del kernel, per favore valutate l'idea di inviare la +vostra chiave per l'inclusione in quel portachiavi. + + +If you are a kernel developer, please consider submitting your key for +inclusion into that keyring. + +.. _`Kernel developer PGP Keyring`: https://korg.docs.kernel.org/pgpkeys.html diff --git a/Documentation/translations/it_IT/process/programming-language.rst b/Documentation/translations/it_IT/process/programming-language.rst index c1a9b481a6f9..5bc5b9d42f31 100644 --- a/Documentation/translations/it_IT/process/programming-language.rst +++ b/Documentation/translations/it_IT/process/programming-language.rst @@ -18,10 +18,6 @@ Linux supporta anche ``clang`` [it-clang]_, leggete la documentazione Questo dialetto contiene diverse estensioni al linguaggio [it-gnu-extensions]_, e molte di queste vengono usate sistematicamente dal kernel. -Il kernel offre un certo livello di supporto per la compilazione con -``icc`` [it-icc]_ su diverse architetture, tuttavia in questo momento -il supporto non è completo e richiede delle patch aggiuntive. - Attributi --------- @@ -43,11 +39,30 @@ possono usare e/o per accorciare il codice. Per maggiori informazioni consultate il file d'intestazione ``include/linux/compiler_attributes.h``. +Rust +---- + +Il kernel supporta sperimentalmente il linguaggio di programmazione Rust +[it-rust-language]_ abilitando l'opzione di configurazione ``CONFIG_RUST``. Il +codice verrà compilato usando ``rustc`` [it-rustc]_ con l'opzione +``--edition=2021`` [it-rust-editions]_. Le edizioni Rust sono un modo per +introdurre piccole modifiche senza compatibilità all'indietro._ + +In aggiunta, nel kernel vengono utilizzate alcune funzionalità considerate +instabili [it-rust-unstable-features]_. Queste funzionalità potrebbero cambiare +in futuro, dunque è un'obiettivo importante è quello di far uso solo di +funzionalità stabili. + +Per maggiori informazioni fate riferimento a Documentation/rust/index.rst . + .. [it-c-language] http://www.open-std.org/jtc1/sc22/wg14/www/standards .. [it-gcc] https://gcc.gnu.org .. [it-clang] https://clang.llvm.org -.. [it-icc] https://software.intel.com/en-us/c-compilers .. [it-gcc-c-dialect-options] https://gcc.gnu.org/onlinedocs/gcc/C-Dialect-Options.html .. [it-gnu-extensions] https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html .. [it-gcc-attribute-syntax] https://gcc.gnu.org/onlinedocs/gcc/Attribute-Syntax.html .. [it-n2049] http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2049.pdf +.. [it-rust-language] https://www.rust-lang.org +.. [it-rustc] https://doc.rust-lang.org/rustc/ +.. [it-rust-editions] https://doc.rust-lang.org/edition-guide/editions/ +.. [it-rust-unstable-features] https://github.com/Rust-for-Linux/linux/issues/2 diff --git a/Documentation/translations/it_IT/process/stable-kernel-rules.rst b/Documentation/translations/it_IT/process/stable-kernel-rules.rst index 0be675b03199..248bf1e4b171 100644 --- a/Documentation/translations/it_IT/process/stable-kernel-rules.rst +++ b/Documentation/translations/it_IT/process/stable-kernel-rules.rst @@ -106,6 +106,12 @@ al messaggio della patch, così: commit <sha1> upstream. +o in alternativa: + +.. code-block:: none + + [ Upstream commit <sha1> ] + In aggiunta, alcune patch inviate attraverso l':ref:`it_option_1` potrebbero dipendere da altre che devo essere incluse. Questa situazione può essere indicata nel seguente modo nell'area dedicata alle firme: diff --git a/Documentation/translations/it_IT/process/submitting-patches.rst b/Documentation/translations/it_IT/process/submitting-patches.rst index 167fce813032..447b18792e61 100644 --- a/Documentation/translations/it_IT/process/submitting-patches.rst +++ b/Documentation/translations/it_IT/process/submitting-patches.rst @@ -429,7 +429,7 @@ poi dovete solo aggiungere una riga che dice:: Signed-off-by: Random J Developer <random@developer.example.org> -usando il vostro vero nome (spiacenti, non si accettano pseudonimi o +usando il vostro vero nome (spiacenti, non si accettano contributi anonimi). Questo verrà fatto automaticamente se usate ``git commit -s``. Anche il ripristino di uno stato precedente dovrebbe includere "Signed-off-by", se usate ``git revert -s`` questo verrà @@ -785,7 +785,7 @@ Riferimenti ----------- Andrew Morton, "La patch perfetta" (tpp). - <http://www.ozlabs.org/~akpm/stuff/tpp.txt> + <https://www.ozlabs.org/~akpm/stuff/tpp.txt> Jeff Garzik, "Formato per la sottomissione di patch per il kernel Linux" <https://web.archive.org/web/20180829112450/http://linux.yyz.us/patch-format.html> diff --git a/Documentation/translations/it_IT/process/volatile-considered-harmful.rst b/Documentation/translations/it_IT/process/volatile-considered-harmful.rst index efc640cac596..4fff9a59b548 100644 --- a/Documentation/translations/it_IT/process/volatile-considered-harmful.rst +++ b/Documentation/translations/it_IT/process/volatile-considered-harmful.rst @@ -119,9 +119,9 @@ concorrenza siano stati opportunamente considerati. Riferimenti =========== -[1] http://lwn.net/Articles/233481/ +[1] https://lwn.net/Articles/233481/ -[2] http://lwn.net/Articles/233482/ +[2] https://lwn.net/Articles/233482/ Crediti ======= diff --git a/Documentation/translations/sp_SP/memory-barriers.txt b/Documentation/translations/sp_SP/memory-barriers.txt index f62bd797216d..27097a808c88 100644 --- a/Documentation/translations/sp_SP/memory-barriers.txt +++ b/Documentation/translations/sp_SP/memory-barriers.txt @@ -604,7 +604,7 @@ READ_ONCE() para DEC Alpha, lo que significa que las únicas personas que necesitan prestar atención a esta sección son aquellas que trabajan en el código especÃfico de la arquitectura DEC Alpha y aquellas que trabajan en READ_ONCE() por dentro. Para aquellos que lo necesitan, y para aquellos que -estén interesados ​​desde un punto de vista histórico, aquà está la historia +estén interesados desde un punto de vista histórico, aquà está la historia de las barreras de dependencia de dirección. [!] Si bien las dependencias de direcciones se observan tanto en carga a diff --git a/Documentation/translations/sp_SP/process/deprecated.rst b/Documentation/translations/sp_SP/process/deprecated.rst new file mode 100644 index 000000000000..d52120e0d753 --- /dev/null +++ b/Documentation/translations/sp_SP/process/deprecated.rst @@ -0,0 +1,381 @@ +.. include:: ../disclaimer-sp.rst + +:Original: :ref:`Documentation/process/deprecated.rst <deprecated>` +:Translator: Sergio Gonzalez <sergio.collado@gmail.com> + +.. _sp_deprecated: + +============================================================================ +Interfaces obsoletos, CaracterÃsticas del lenguaje, Atributos y Convenciones +============================================================================ + +En un mundo perfecto, serÃa posible convertir todas las instancias de +alguna API obsoleta en una nueva API y quitar la API anterior en un +único ciclo de desarrollo. Desafortunadamente, debido al tamaño del kernel, +la jerarquÃa de mantenimiento, y el tiempo, no siempre es posible hacer +estos cambios de una única vez. Esto significa que las nuevas instancias +han de ir creándose en el kernel, mientras que las antiguas se quitan, +haciendo que la cantidad de trabajo para limpiar las APIs crezca. Para +informar a los desarrolladores sobre qué ha sido declarado obsoleto y por +qué, ha sido creada esta lista como un lugar donde indicar cuando los usos +obsoletos son propuestos para incluir en el kernel. + +__deprecated +------------ +Mientras que este atributo señala visualmente que un interface ha sido +declarado obsoleto, este `no produce más avisos durante las compilaciones +<https://git.kernel.org/linus/771c035372a036f83353eef46dbb829780330234>`_ +porque uno de los objetivos del kernel es que compile sin avisos, y +nadie ha hecho nada para quitar estos interfaces obsoletos. Mientras +que usar `__deprecated` es sencillo para anotar una API obsoleta en +un archivo de cabecera, no es la solución completa. Dichos interfaces +deben o bien ser quitados por completo, o añadidos a este archivo para +desanimar a otros a usarla en el futuro. + +BUG() y BUG_ON() +---------------- +Use WARN() y WARN_ON() en su lugar, y gestione las condiciones de error +"imposibles" tan elegantemente como se pueda. Mientras que la familia de +funciones BUG() fueron originalmente diseñadas para actuar como una +"situación imposible", confirmar y disponer de un hilo del kernel de forma +"segura", estas funciones han resultado ser demasiado arriesgadas. (e.g. +"¿en qué orden se necesitan liberar los locks? ¿Se han restaurado sus +estados?). La popular función BUG() desestabilizará el sistema o lo romperá +totalmente, lo cual hace imposible depurarlo o incluso generar reportes de +crash. Linus tiene una `opinión muy fuerte +<https://lore.kernel.org/lkml/CA+55aFy6jNLsywVYdGp83AMrXBo_P-pkjkphPGrO=82SPKCpLQ@mail.gmail.com/>`_ +y sentimientos `sobre esto +<https://lore.kernel.org/lkml/CAHk-=whDHsbK3HTOpTF=ue_o04onRwTEaK_ZoJp_fjbqq4+=Jw@mail.gmail.com/>`_. + +Nótese que la familia de funciones WARN() únicamente deberÃa ser usada +en situaciones que se "esperan no sean alcanzables". Si se quiere +avisar sobre situaciones "alcanzables pero no deseadas", úsese la familia +de funciones pr_warn(). Los responsables del sistema pueden haber definido +*panic_on_warn* sysctl para asegurarse que sus sistemas no continúan +ejecutándose en presencia del condiciones "no alcanzables". (Por ejemplo, +véase commits como `este +<https://git.kernel.org/linus/d4689846881d160a4d12a514e991a740bcb5d65a>`_.) + +Operaciones aritméticas en los argumentos de reserva de memoria +--------------------------------------------------------------- +Los cálculos dinámicos de tamaño (especialmente multiplicaciones) no +deberÃan realizarse en los argumentos de reserva de memoria (o similares) +debido al riesgo de desbordamiento. Esto puede llevar a valores rotando y +que se realicen reservas de memoria menores que las que se esperaban. El +uso de esas reservas puede llevar a desbordamientos en el 'heap' de memoria +y otros funcionamientos incorrectos. (Una excepción a esto son los valores +literales donde el compilador si puede avisar si estos puede desbordarse. +De todos modos, el método recomendado en estos caso es reescribir el código +como se sugiere a continuación para evitar las operaciones aritméticas en +la reserva de memoria.) + +Por ejemplo, no utilice `count * size`` como argumento, como en:: + + foo = kmalloc(count * size, GFP_KERNEL); + +En vez de eso, utilice la reserva con dos argumentos:: + + foo = kmalloc_array(count, size, GFP_KERNEL); + +EspecÃficamente, kmalloc() puede ser sustituido con kmalloc_array(), +kzalloc() puede ser sustituido con kcalloc(). + +Si no existen funciones con dos argumentos, utilice las funciones que se +saturan, en caso de desbordamiento:: + + bar = vmalloc(array_size(count, size)); + +Otro caso común a evitar es calcular el tamaño de una estructura com +la suma de otras estructuras, como en:: + + header = kzalloc(sizeof(*header) + count * sizeof(*header->item), + GFP_KERNEL); + +En vez de eso emplee:: + + header = kzalloc(struct_size(header, item, count), GFP_KERNEL); + +.. note:: Si se usa struct_size() en una estructura que contiene un elemento + de longitud cero o un array de un único elemento como un array miembro, + por favor reescribir ese uso y cambiar a un `miembro array flexible + <#zero-length-and-one-element-arrays>`_ + + +Para otros cálculos, por favor use las funciones de ayuda: size_mul(), +size_add(), and size_sub(). Por ejemplo, en el caso de:: + + foo = krealloc(current_size + chunk_size * (count - 3), GFP_KERNEL); + +Re-escrÃbase, como:: + + foo = krealloc(size_add(current_size, + size_mul(chunk_size, + size_sub(count, 3))), GFP_KERNEL); + +Para más detalles, mire también array3_size() y flex_array_size(), +como también la familia de funciones relacionadas check_mul_overflow(), +check_add_overflow(), check_sub_overflow(), y check_shl_overflow(). + + +simple_strtol(), simple_strtoll(), simple_strtoul(), simple_strtoull() +---------------------------------------------------------------------- +Las funciones: simple_strtol(), simple_strtoll(), simple_strtoul(), y +simple_strtoull() explÃcitamente ignoran los desbordamientos, lo que puede +llevar a resultados inesperados por las funciones que las llaman. Las +funciones respectivas kstrtol(), kstrtoll(), kstrtoul(), y kstrtoull() +tienden a ser reemplazos correctos, aunque nótese que necesitarán que la +cadena de caracteres termine en NUL o en el carácter de lÃnea nueva. + + +strcpy() +-------- +strcpy() no realiza verificaciones de los lÃmites del buffer de destino. +Esto puede resultar en desbordamientos lineals más allá del fin del buffer, +causando todo tipo de errores. Mientras `CONFIG_FORTIFY_SOURCE=y` otras +varias opciones de compilación reducen el riesgo de usar esta función, no +hay ninguna buena razón para añadir nuevos usos de esta. El remplazo seguro +es la función strscpy(), aunque se ha de tener cuidado con cualquier caso +en el el valor retornado por strcpy() sea usado, ya que strscpy() no +devuelve un puntero a el destino, sino el número de caracteres no nulos +compilados (o el valor negativo de errno cuando se trunca la cadena de +caracteres). + +strncpy() en cadenas de caracteres terminadas en NUL +---------------------------------------------------- +El uso de strncpy() no garantiza que el buffer de destino esté terminado en +NUL. Esto puede causar varios errores de desbordamiento en lectura y otros +tipos de funcionamiento erróneo debido a que falta la terminación en NUL. +Esta función también termina la cadena de caracteres en NUL en el buffer de +destino si la cadena de origen es más corta que el buffer de destino, lo +cual puede ser una penalización innecesaria para funciones usen esta +función con cadenas de caracteres que sà están terminadas en NUL. + +Cuando se necesita que la cadena de destino sea terminada en NUL, +el mejor reemplazo es usar la función strscpy(), aunque se ha de tener +cuidado en los casos en los que el valor de strncpy() fuera usado, ya que +strscpy() no devuelve un puntero al destino, sino el número de +caracteres no nulos copiados (o el valor negativo de errno cuando se trunca +la cadena de caracteres). Cualquier caso restante que necesitase todavÃa +ser terminado en el caracter nulo, deberÃa usar strscpy_pad(). + +Si una función usa cadenas de caracteres que no necesitan terminar en NUL, +deberÃa usarse strtomem(), y el destino deberÃa señalarse con el atributo +`__nonstring +<https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html>`_ +para evitar avisos futuros en el compilador. Para casos que todavÃa +necesitan cadenas de caracteres que se rellenen al final con el +caracter NUL, usar strtomem_pad(). + +strlcpy() +--------- +strlcpy() primero lee por completo el buffer de origen (ya que el valor +devuelto intenta ser el mismo que el de strlen()). Esta lectura puede +sobrepasar el lÃmite de tamaño del destino. Esto ineficiente y puede causar +desbordamientos de lectura si la cadena de origen no está terminada en el +carácter NUL. El reemplazo seguro de esta función es strscpy(), pero se ha +de tener cuidado que en los casos en lso que se usase el valor devuelto de +strlcpy(), ya que strscpy() devolverá valores negativos de erno cuando se +produzcan truncados. + +Especificación de formato %p +---------------------------- +Tradicionalmente,el uso de "%p" en el formato de cadenas de caracteres +resultarÃa en exponer esas direcciones en dmesg, proc, sysfs, etc. En vez +de dejar que sean una vulnerabilidad, todos los "%p" que se usan en el +kernel se imprimen como un hash, haciéndolos efectivamente inutilizables +para usarlos como direcciones de memoria. Nuevos usos de "%p" no deberÃan +ser añadidos al kernel. Para textos de direcciones, usar "%pS" es +mejor, ya que resulta en el nombre del sÃmbolo. Para prácticamente el +resto de casos, mejor no usar "%p" en absoluto. + +Parafraseando las actuales `direcciones de Linus <https://lore.kernel.org/lkml/CA+55aFwQEd_d40g4mUCSsVRZzrFPUJt74vc6PPpb675hYNXcKw@mail.gmail.com/>`_: + +- Si el valor "hasheado" "%p" no tienen ninguna finalidad, preguntarse si el + puntero es realmente importante. ¿Quizás se podrÃa quitar totalmente? +- Si realmente se piensa que el valor del puntero es importante, ¿porqué + algún estado del sistema o nivel de privilegio de usuario es considerado + "especial"? Si piensa que puede justificarse (en comentarios y mensajes + del commit), de forma suficiente como para pasar el escrutinio de Linux, + quizás pueda usar el "%p", a la vez que se asegura que tiene los permisos + correspondientes. + +Si está depurando algo donde el "%p" hasheado está causando problemas, +se puede arrancar temporalmente con la opción de depuración "`no_hash_pointers +<https://git.kernel.org/linus/5ead723a20e0447bc7db33dc3070b420e5f80aa6>`_". + + +Arrays de longitud variable (VLAs) +---------------------------------- +Usando VLA en la pila (stack) produce un código mucho peor que los arrays +de tamaño estático. Mientras que estos errores no triviales de `rendimiento +<https://git.kernel.org/linus/02361bc77888>`_ son razón suficiente +para no usar VLAs, esto además son un riesgo de seguridad. El crecimiento +dinámico del array en la pila, puede exceder la memoria restante en +el segmento de la pila. Esto podrÃa llevara a un fallo, posible sobre-escritura +de contenido al final de la pila (cuando se construye sin +`CONFIG_THREAD_INFO_IN_TASK=y`), o sobre-escritura de la memoria adyacente +a la pila (cuando se construye sin `CONFIG_VMAP_STACK=y`). + + +Switch case fall-through implÃcito +---------------------------------- +El lenguaje C permite a las sentencias 'switch' saltar de un caso al +siguiente caso cuando la sentencia de ruptura "break" no aparece al final +del caso. Esto, introduce ambigüedad en el código, ya que no siempre está +claro si el 'break' que falta es intencionado o un olvido. Por ejemplo, no +es obvio solamente mirando al código si `STATE_ONE` está escrito para +intencionadamente saltar en `STATE_TWO`:: + + switch (value) { + case STATE_ONE: + do_something(); + case STATE_TWO: + do_other(); + break; + default: + WARN("unknown state"); + } + +Ya que ha habido una larga lista de defectos `debidos a declaraciones de "break" +que faltan <https://cwe.mitre.org/data/definitions/484.html>`_, no se +permiten 'fall-through' implÃcitos. Para identificar 'fall-through' +intencionados, se ha adoptado la pseudo-palabra-clave macro "falltrhrough", +que expande las extensiones de gcc `__attribute__((__fallthrough__)) +<https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html>`_. +(Cuando la sintaxis de C17/c18 `[[fallthrough]]` sea más comúnmente +soportadas por los compiladores de C, analizadores estáticos, e IDEs, +se puede cambiar a usar esa sintaxis para esa pseudo-palabra-clave. + +Todos los bloques switch/case deben acabar en uno de: + +* break; +* fallthrough; +* continue; +* goto <label>; +* return [expression]; + + +Arrays de longitud cero y un elemento +------------------------------------- +Hay una necesidad habitual en el kernel de proveer una forma para declarar +un grupo de elementos consecutivos de tamaño dinámico en una estructura. +El código del kernel deberÃa usar siempre `"miembros array flexible" <https://en.wikipedia.org/wiki/Flexible_array_member>`_ +en estos casos. El estilo anterior de arrays de un elemento o de longitud +cero, no deben usarse más. + +En el código C más antiguo, los elementos finales de tamaño dinámico se +obtenÃan especificando un array de un elemento al final de una estructura:: + + struct something { + size_t count; + struct foo items[1]; + }; + +En código C más antiguo, elementos seguidos de tamaño dinámico eran creados +especificando una array de un único elemento al final de una estructura:: + + struct something { + size_t count; + struct foo items[1]; + }; + +Esto llevó a resultados incorrectos en los cálculos de tamaño mediante +sizeof() (el cual hubiera necesitado eliminar el tamaño del último elemento +para tener un tamaño correcto de la "cabecera"). Una `extensión de GNU C +<https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html>`_ se empezó a usar +para permitir los arrays de longitud cero, para evitar estos tipos de +problemas de tamaño:: + + struct something { + size_t count; + struct foo items[0]; + }; + +Pero esto llevó a otros problemas, y no solucionó algunos otros problemas +compartidos por ambos estilos, como no ser capaz de detectar cuando ese array +accidentalmente _no_ es usado al final de la estructura (lo que podÃa pasar +directamente, o cuando dicha estructura era usada en uniones, estructuras +de estructuras, etc). + +C99 introdujo "los arrays miembros flexibles", los cuales carecen de un +tamaño numérico en su declaración del array:: + + struct something { + size_t count; + struct foo items[]; + }; + +Esta es la forma en la que el kernel espera que se declaren los elementos +de tamaño dinámico concatenados. Esto permite al compilador generar +errores, cuando el array flexible no es declarado en el último lugar de la +estructura, lo que ayuda a prevenir errores en él código del tipo +`comportamiento indefinido <https://git.kernel.org/linus/76497732932f15e7323dc805e8ea8dc11bb587cf>`_. +Esto también permite al compilador analizar correctamente los tamaños de +los arrays (via sizeof(), `CONFIG_FORTIFY_SOURCE`, y `CONFIG_UBSAN_BOUNDS`). +Por ejemplo, si no hay un mecanismo que avise que el siguiente uso de +sizeof() en un array de longitud cero, siempre resulta en cero:: + + struct something { + size_t count; + struct foo items[0]; + }; + + struct something *instance; + + instance = kmalloc(struct_size(instance, items, count), GFP_KERNEL); + instance->count = count; + + size = sizeof(instance->items) * instance->count; + memcpy(instance->items, source, size); + +En la última lÃnea del código anterior, ``zero`` vale ``cero``, cuando uno +podrÃa esperar que representa el tamaño total en bytes de la memoria dinámica +reservada para el array consecutivo ``items``. Aquà hay un par de ejemplos +más sobre este tema: `link 1 +<https://git.kernel.org/linus/f2cd32a443da694ac4e28fbf4ac6f9d5cc63a539>`_, +`link 2 +<https://git.kernel.org/linus/ab91c2a89f86be2898cee208d492816ec238b2cf>`_. +Sin embargo, los array de miembros flexibles tienen un type incompleto, y +no se ha de aplicar el operador sizeof()<https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html>`_, +asà cualquier mal uso de dichos operadores será detectado inmediatamente en +el momento de compilación. + +Con respecto a los arrays de un único elemento, se ha de ser consciente de +que dichos arrays ocupan al menos tanto espacio como un único objeto del +tipo https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html>`_, de ahà que +estos contribuyan al tamaño de la estructura que los contiene. Esto es +proclive a errores cada vez que se quiere calcular el tamaño total de la +memoria dinámica para reservar una estructura que contenga un array de este +tipo como su miembro:: + + struct something { + size_t count; + struct foo items[1]; + }; + + struct something *instance; + + instance = kmalloc(struct_size(instance, items, count - 1), GFP_KERNEL); + instance->count = count; + + size = sizeof(instance->items) * instance->count; + memcpy(instance->items, source, size); + +En el ejemplo anterior, hemos de recordar calcular ``count - 1``, cuando se +usa la función de ayuda struct_size(), de otro modo estarÃamos +--desintencionadamente--reservando memoria para un ``items`` de más. La +forma más clara y menos proclive a errores es implementar esto mediante el +uso de `array miembro flexible`, junto con las funciones de ayuda: +struct_size() y flex_array_size():: + + struct something { + size_t count; + struct foo items[]; + }; + + struct something *instance; + + instance = kmalloc(struct_size(instance, items, count), GFP_KERNEL); + instance->count = count; + + memcpy(instance->items, source, flex_array_size(instance, items, instance->count)); diff --git a/Documentation/translations/sp_SP/process/index.rst b/Documentation/translations/sp_SP/process/index.rst index 0f1e131b3bb1..3b0c32593726 100644 --- a/Documentation/translations/sp_SP/process/index.rst +++ b/Documentation/translations/sp_SP/process/index.rst @@ -18,3 +18,4 @@ email-clients magic-number programming-language + deprecated diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/lru_sort.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/lru_sort.rst index 812ef315c8f6..03d33c710604 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/lru_sort.rst @@ -250,7 +250,7 @@ LRU的优先级的æå‡ï¼ŒåŒæ—¶é™ä½Žé‚£äº›è¶…过120ç§’æ— äººè®¿é—®çš„å†…å˜åŒ ç†è¢«é™åˆ¶åœ¨æœ€å¤š1%çš„CPU以é¿å…DAMON_LRU_SORT消费过多CPU时间。在系统空闲内å˜è¶…过50% æ—¶DAMON_LRU_SORTåœæ¢å·¥ä½œï¼Œå¹¶åœ¨ä½ŽäºŽ40%æ—¶é‡æ–°å¼€å§‹å·¥ä½œã€‚如果DAMON_RECLAIM没有å–å¾— 进展且空闲内å˜ä½ŽäºŽ20%,å†æ¬¡è®©DAMON_LRU_SORTåœæ¢å·¥ä½œï¼Œä»¥æ¤å›žé€€åˆ°ä»¥LRU链表为基础 -以页é¢ä¸ºå•ä½çš„内å˜å›žæ”¶ä¸Šã€‚ +以页é¢ä¸ºå•ä½çš„内å˜å›žæ”¶ä¸Šã€‚ :: # cd /sys/modules/damon_lru_sort/parameters # echo 500 > hot_thres_access_freq diff --git a/Documentation/translations/zh_CN/arch.rst b/Documentation/translations/zh_CN/arch/index.rst index 690e173d8b2a..908ea131bb1c 100644 --- a/Documentation/translations/zh_CN/arch.rst +++ b/Documentation/translations/zh_CN/arch/index.rst @@ -8,12 +8,12 @@ .. toctree:: :maxdepth: 2 - mips/index - arm64/index - riscv/index + ../mips/index + ../arm64/index + ../riscv/index openrisc/index parisc/index - loongarch/index + ../loongarch/index TODOList: diff --git a/Documentation/translations/zh_CN/openrisc/index.rst b/Documentation/translations/zh_CN/arch/openrisc/index.rst index 9ad6cc600884..da21f8ab894b 100644 --- a/Documentation/translations/zh_CN/openrisc/index.rst +++ b/Documentation/translations/zh_CN/arch/openrisc/index.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -.. include:: ../disclaimer-zh_CN.rst +.. include:: ../../disclaimer-zh_CN.rst -:Original: Documentation/openrisc/index.rst +:Original: Documentation/arch/openrisc/index.rst :翻译: diff --git a/Documentation/translations/zh_CN/openrisc/openrisc_port.rst b/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst index b8a67670492d..cadc580fa23b 100644 --- a/Documentation/translations/zh_CN/openrisc/openrisc_port.rst +++ b/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst @@ -1,6 +1,6 @@ -.. include:: ../disclaimer-zh_CN.rst +.. include:: ../../disclaimer-zh_CN.rst -:Original: Documentation/openrisc/openrisc_port.rst +:Original: Documentation/arch/openrisc/openrisc_port.rst :翻译: diff --git a/Documentation/translations/zh_CN/openrisc/todo.rst b/Documentation/translations/zh_CN/arch/openrisc/todo.rst index 63c38717edb1..1f6f95616633 100644 --- a/Documentation/translations/zh_CN/openrisc/todo.rst +++ b/Documentation/translations/zh_CN/arch/openrisc/todo.rst @@ -1,6 +1,6 @@ -.. include:: ../disclaimer-zh_CN.rst +.. include:: ../../disclaimer-zh_CN.rst -:Original: Documentation/openrisc/todo.rst +:Original: Documentation/arch/openrisc/todo.rst :翻译: diff --git a/Documentation/translations/zh_CN/parisc/debugging.rst b/Documentation/translations/zh_CN/arch/parisc/debugging.rst index 68b73eb57105..c6b9de6d3175 100644 --- a/Documentation/translations/zh_CN/parisc/debugging.rst +++ b/Documentation/translations/zh_CN/arch/parisc/debugging.rst @@ -1,6 +1,6 @@ -.. include:: ../disclaimer-zh_CN.rst +.. include:: ../../disclaimer-zh_CN.rst -:Original: Documentation/parisc/debugging.rst +:Original: Documentation/arch/parisc/debugging.rst :翻译: diff --git a/Documentation/translations/zh_CN/parisc/index.rst b/Documentation/translations/zh_CN/arch/parisc/index.rst index 0cc553fc8272..9f69283bd1c9 100644 --- a/Documentation/translations/zh_CN/parisc/index.rst +++ b/Documentation/translations/zh_CN/arch/parisc/index.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 -.. include:: ../disclaimer-zh_CN.rst +.. include:: ../../disclaimer-zh_CN.rst -:Original: Documentation/parisc/index.rst +:Original: Documentation/arch/parisc/index.rst :翻译: diff --git a/Documentation/translations/zh_CN/parisc/registers.rst b/Documentation/translations/zh_CN/arch/parisc/registers.rst index d2ab1874a602..a55250afcc27 100644 --- a/Documentation/translations/zh_CN/parisc/registers.rst +++ b/Documentation/translations/zh_CN/arch/parisc/registers.rst @@ -1,6 +1,6 @@ -.. include:: ../disclaimer-zh_CN.rst +.. include:: ../../disclaimer-zh_CN.rst -:Original: Documentation/parisc/registers.rst +:Original: Documentation/arch/parisc/registers.rst :翻译: diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index 7c3216845b71..299704c0818d 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -120,7 +120,7 @@ TODOList: .. toctree:: :maxdepth: 2 - arch + arch/index 其他文档 -------- diff --git a/Documentation/userspace-api/ELF.rst b/Documentation/userspace-api/ELF.rst new file mode 100644 index 000000000000..ac8aeacd458d --- /dev/null +++ b/Documentation/userspace-api/ELF.rst @@ -0,0 +1,34 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================= +Linux-specific ELF idiosyncrasies +================================= + +Definitions +=========== + +"First" program header is the one with the smallest offset in the file: +e_phoff. + +"Last" program header is the one with the biggest offset in the file: +e_phoff + (e_phnum - 1) * sizeof(Elf_Phdr). + +PT_INTERP +========= + +First PT_INTERP program header is used to locate the filename of ELF +interpreter. Other PT_INTERP headers are ignored (since Linux 2.4.11). + +PT_GNU_STACK +============ + +Last PT_GNU_STACK program header defines userspace stack executability +(since Linux 2.6.6). Other PT_GNU_STACK headers are ignored. + +PT_GNU_PROPERTY +=============== + +ELF interpreter's last PT_GNU_PROPERTY program header is used (since +Linux 5.8). If interpreter doesn't have one, then the last PT_GNU_PROPERTY +program header of an executable is used. Other PT_GNU_PROPERTY headers +are ignored. diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index f16337bdb852..72a65db0c498 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -23,6 +23,7 @@ place where this information is gathered. spec_ctrl accelerators/ocxl ebpf/index + ELF ioctl/index iommu iommufd diff --git a/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst b/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst index d330aeb4d3eb..d9d7b7621d8c 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst @@ -778,7 +778,7 @@ number of bits for each component. \tiny \setlength{\tabcolsep}{2pt} -.. tabularcolumns:: |p{2.8cm}|p{2.0cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}| +.. tabularcolumns:: |p{3.2cm}|p{0.8cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}| .. flat-table:: RGB Formats 10 Bits Per Color Component @@ -868,7 +868,6 @@ number of bits for each component. - r\ :sub:`4` - r\ :sub:`3` - r\ :sub:`2` - - * .. _V4L2-PIX-FMT-RGBA1010102: - ``V4L2_PIX_FMT_RGBA1010102`` @@ -909,7 +908,6 @@ number of bits for each component. - r\ :sub:`4` - r\ :sub:`3` - r\ :sub:`2` - - * .. _V4L2-PIX-FMT-ARGB2101010: - ``V4L2_PIX_FMT_ARGB2101010`` @@ -950,7 +948,6 @@ number of bits for each component. - r\ :sub:`6` - r\ :sub:`5` - r\ :sub:`4` - - .. raw:: latex diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst index bf593e88cfd9..68b0d2363af8 100644 --- a/Documentation/virt/coco/sev-guest.rst +++ b/Documentation/virt/coco/sev-guest.rst @@ -37,11 +37,11 @@ along with a description: the return value. General error numbers (-ENOMEM, -EINVAL) are not detailed, but errors with specific meanings are. -The guest ioctl should be issued on a file descriptor of the /dev/sev-guest device. -The ioctl accepts struct snp_user_guest_request. The input and output structure is -specified through the req_data and resp_data field respectively. If the ioctl fails -to execute due to a firmware error, then fw_err code will be set otherwise the -fw_err will be set to 0x00000000000000ff. +The guest ioctl should be issued on a file descriptor of the /dev/sev-guest +device. The ioctl accepts struct snp_user_guest_request. The input and +output structure is specified through the req_data and resp_data field +respectively. If the ioctl fails to execute due to a firmware error, then +the fw_error code will be set, otherwise fw_error will be set to -1. The firmware checks that the message sequence counter is one greater than the guests message sequence counter. If guest driver fails to increment message @@ -57,8 +57,14 @@ counter (e.g. counter overflow), then -EIO will be returned. __u64 req_data; __u64 resp_data; - /* firmware error code on failure (see psp-sev.h) */ - __u64 fw_err; + /* bits[63:32]: VMM error code, bits[31:0] firmware error code (see psp-sev.h) */ + union { + __u64 exitinfo2; + struct { + __u32 fw_error; + __u32 vmm_error; + }; + }; }; 2.1 SNP_GET_REPORT diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a5c803f39832..841e9d1987bd 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7456,7 +7456,7 @@ system fingerprint. To prevent userspace from circumventing such restrictions by running an enclave in a VM, KVM prevents access to privileged attributes by default. -See Documentation/x86/sgx.rst for more details. +See Documentation/arch/x86/sgx.rst for more details. 7.26 KVM_CAP_PPC_RPT_INVALIDATE ------------------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 90abe83c02f3..30cc8f683dc6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -224,13 +224,13 @@ S: Orphan / Obsolete F: drivers/net/ethernet/8390/ 9P FILE SYSTEM -M: Eric Van Hensbergen <ericvh@gmail.com> +M: Eric Van Hensbergen <ericvh@kernel.org> M: Latchesar Ionkov <lucho@ionkov.net> M: Dominique Martinet <asmadeus@codewreck.org> R: Christian Schoenebeck <linux_oss@crudebyte.com> -L: v9fs-developer@lists.sourceforge.net +L: v9fs@lists.linux.dev S: Maintained -W: http://swik.net/v9fs +W: http://github.com/v9fs Q: http://patchwork.kernel.org/project/v9fs-devel/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git T: git git://github.com/martinetd/linux.git @@ -1071,7 +1071,7 @@ M: Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com> R: Carlos Bilbao <carlos.bilbao@amd.com> L: platform-driver-x86@vger.kernel.org S: Maintained -F: Documentation/x86/amd_hsmp.rst +F: Documentation/arch/x86/amd_hsmp.rst F: arch/x86/include/asm/amd_hsmp.h F: arch/x86/include/uapi/asm/amd_hsmp.h F: drivers/platform/x86/amd/hsmp.c @@ -4461,14 +4461,14 @@ F: Documentation/devicetree/bindings/net/ieee802154/ca8210.txt F: drivers/net/ieee802154/ca8210.c CANAAN/KENDRYTE K210 SOC FPIOA DRIVER -M: Damien Le Moal <damien.lemoal@wdc.com> +M: Damien Le Moal <dlemoal@kernel.org> L: linux-riscv@lists.infradead.org L: linux-gpio@vger.kernel.org (pinctrl driver) F: Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml F: drivers/pinctrl/pinctrl-k210.c CANAAN/KENDRYTE K210 SOC RESET CONTROLLER DRIVER -M: Damien Le Moal <damien.lemoal@wdc.com> +M: Damien Le Moal <dlemoal@kernel.org> L: linux-kernel@vger.kernel.org L: linux-riscv@lists.infradead.org S: Maintained @@ -4476,7 +4476,7 @@ F: Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml F: drivers/reset/reset-k210.c CANAAN/KENDRYTE K210 SOC SYSTEM CONTROLLER DRIVER -M: Damien Le Moal <damien.lemoal@wdc.com> +M: Damien Le Moal <dlemoal@kernel.org> L: linux-riscv@lists.infradead.org S: Maintained F: Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml @@ -5940,11 +5940,6 @@ F: drivers/devfreq/event/ F: include/dt-bindings/pmu/exynos_ppmu.h F: include/linux/devfreq-event.h -DEVICE NUMBER REGISTRY -M: Torben Mathiasen <device@lanana.org> -S: Maintained -W: http://lanana.org/docs/device-list/index.html - DEVICE RESOURCE MANAGEMENT HELPERS M: Hans de Goede <hdegoede@redhat.com> R: Matti Vaittinen <mazziesaccount@gmail.com> @@ -6207,6 +6202,7 @@ DOCUMENTATION REPORTING ISSUES M: Thorsten Leemhuis <linux@leemhuis.info> L: linux-doc@vger.kernel.org S: Maintained +F: Documentation/admin-guide/quickly-build-trimmed-linux.rst F: Documentation/admin-guide/reporting-issues.rst DOCUMENTATION SCRIPTS @@ -9729,7 +9725,7 @@ F: include/linux/i3c/ IA64 (Itanium) PLATFORM L: linux-ia64@vger.kernel.org S: Orphan -F: Documentation/ia64/ +F: Documentation/arch/ia64/ F: arch/ia64/ IBM Operation Panel Input Driver @@ -10648,7 +10644,7 @@ L: tboot-devel@lists.sourceforge.net S: Supported W: http://tboot.sourceforge.net T: hg http://tboot.hg.sourceforge.net:8000/hgroot/tboot/tboot -F: Documentation/x86/intel_txt.rst +F: Documentation/arch/x86/intel_txt.rst F: arch/x86/kernel/tboot.c F: include/linux/tboot.h @@ -10659,7 +10655,7 @@ L: linux-sgx@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/intel-sgx/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/sgx -F: Documentation/x86/sgx.rst +F: Documentation/arch/x86/sgx.rst F: arch/x86/entry/vdso/vsgx.S F: arch/x86/include/asm/sgx.h F: arch/x86/include/uapi/asm/sgx.h @@ -11758,7 +11754,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git F: drivers/ata/sata_promise.* LIBATA SUBSYSTEM (Serial and Parallel ATA drivers) -M: Damien Le Moal <damien.lemoal@opensource.wdc.com> +M: Damien Le Moal <dlemoal@kernel.org> L: linux-ide@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/libata.git @@ -14594,6 +14590,7 @@ F: net/netlabel/ NETWORKING [MPTCP] M: Matthieu Baerts <matthieu.baerts@tessares.net> +M: Mat Martineau <martineau@kernel.org> L: netdev@vger.kernel.org L: mptcp@lists.linux.dev S: Maintained @@ -15639,7 +15636,7 @@ S: Maintained W: http://openrisc.io T: git https://github.com/openrisc/linux.git F: Documentation/devicetree/bindings/openrisc/ -F: Documentation/openrisc/ +F: Documentation/arch/openrisc/ F: arch/openrisc/ F: drivers/irqchip/irq-ompic.c F: drivers/irqchip/irq-or1k-* @@ -15835,7 +15832,7 @@ W: https://parisc.wiki.kernel.org Q: http://patchwork.kernel.org/project/linux-parisc/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/parisc-2.6.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git -F: Documentation/parisc/ +F: Documentation/arch/parisc/ F: arch/parisc/ F: drivers/char/agp/parisc-agp.c F: drivers/input/misc/hp_sdc_rtc.c @@ -17166,6 +17163,12 @@ F: fs/qnx4/ F: include/uapi/linux/qnx4_fs.h F: include/uapi/linux/qnxtypes.h +QNX6 FILESYSTEM +S: Orphan +F: Documentation/filesystems/qnx6.rst +F: fs/qnx6/ +F: include/linux/qnx6_fs.h + QORIQ DPAA2 FSL-MC BUS DRIVER M: Stuart Yoder <stuyoder@gmail.com> M: Laurentiu Tudor <laurentiu.tudor@nxp.com> @@ -17626,7 +17629,7 @@ M: Fenghua Yu <fenghua.yu@intel.com> M: Reinette Chatre <reinette.chatre@intel.com> L: linux-kernel@vger.kernel.org S: Supported -F: Documentation/x86/resctrl* +F: Documentation/arch/x86/resctrl* F: arch/x86/include/asm/resctrl.h F: arch/x86/kernel/cpu/resctrl/ F: tools/testing/selftests/resctrl/ @@ -17635,11 +17638,13 @@ READ-COPY UPDATE (RCU) M: "Paul E. McKenney" <paulmck@kernel.org> M: Frederic Weisbecker <frederic@kernel.org> (kernel/rcu/tree_nocb.h) M: Neeraj Upadhyay <quic_neeraju@quicinc.com> (kernel/rcu/tasks.h) +M: Joel Fernandes <joel@joelfernandes.org> M: Josh Triplett <josh@joshtriplett.org> +M: Boqun Feng <boqun.feng@gmail.com> R: Steven Rostedt <rostedt@goodmis.org> R: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> R: Lai Jiangshan <jiangshanlai@gmail.com> -R: Joel Fernandes <joel@joelfernandes.org> +R: Zqiang <qiang1.zhang@intel.com> L: rcu@vger.kernel.org S: Supported W: http://www.rdrop.com/users/paulmck/RCU/ @@ -18821,8 +18826,8 @@ S: Supported W: https://selinuxproject.org W: https://github.com/SELinuxProject T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git -F: Documentation/ABI/obsolete/sysfs-selinux-checkreqprot -F: Documentation/ABI/obsolete/sysfs-selinux-disable +F: Documentation/ABI/removed/sysfs-selinux-checkreqprot +F: Documentation/ABI/removed/sysfs-selinux-disable F: Documentation/admin-guide/LSM/SELinux.rst F: include/trace/events/avc.h F: include/uapi/linux/selinux_netlink.h @@ -20112,7 +20117,7 @@ M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> L: linux-sh@vger.kernel.org S: Maintained Q: http://patchwork.kernel.org/project/linux-sh/list/ -F: Documentation/sh/ +F: Documentation/arch/sh/ F: arch/sh/ F: drivers/sh/ @@ -20172,7 +20177,7 @@ M: Vineet Gupta <vgupta@kernel.org> L: linux-snps-arc@lists.infradead.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git -F: Documentation/arc/ +F: Documentation/arch/arc F: Documentation/devicetree/bindings/arc/* F: Documentation/devicetree/bindings/interrupt-controller/snps,arc* F: arch/arc/ @@ -21231,6 +21236,14 @@ S: Maintained F: Documentation/tools/rtla/ F: tools/tracing/rtla/ +TECHNICAL ADVISORY BOARD PROCESS DOCS +M: "Theodore Ts'o" <tytso@mit.edu> +M: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +L: tech-board-discuss@lists.linux-foundation.org +S: Maintained +F: Documentation/process/researcher-guidelines.rst +F: Documentation/process/contribution-maturity-model.rst + TRADITIONAL CHINESE DOCUMENTATION M: Hu Haowen <src.res@email.cn> L: linux-doc-tw-discuss@lists.sourceforge.net (moderated for non-subscribers) @@ -22164,7 +22177,9 @@ L: virtualization@lists.linux-foundation.org L: netdev@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git +F: kernel/vhost_task.c F: drivers/vhost/ +F: include/linux/sched/vhost_task.h F: include/linux/vhost_iotlb.h F: include/uapi/linux/vhost.h @@ -22638,7 +22653,7 @@ L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core F: Documentation/devicetree/bindings/x86/ -F: Documentation/x86/ +F: Documentation/arch/x86/ F: arch/x86/ X86 ENTRY CODE @@ -22648,13 +22663,24 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/asm F: arch/x86/entry/ +X86 HARDWARE VULNERABILITIES +M: Thomas Gleixner <tglx@linutronix.de> +M: Borislav Petkov <bp@alien8.de> +M: Peter Zijlstra <peterz@infradead.org> +M: Josh Poimboeuf <jpoimboe@kernel.org> +R: Pawan Gupta <pawan.kumar.gupta@linux.intel.com> +S: Maintained +F: Documentation/admin-guide/hw-vuln/ +F: arch/x86/include/asm/nospec-branch.h +F: arch/x86/kernel/cpu/bugs.c + X86 MCE INFRASTRUCTURE M: Tony Luck <tony.luck@intel.com> M: Borislav Petkov <bp@alien8.de> L: linux-edac@vger.kernel.org S: Maintained F: Documentation/ABI/testing/sysfs-mce -F: Documentation/x86/x86_64/machinecheck.rst +F: Documentation/arch/x86/x86_64/machinecheck.rst F: arch/x86/kernel/cpu/mce/* X86 MICROCODE UPDATE SUPPORT @@ -23115,7 +23141,7 @@ S: Maintained F: arch/x86/kernel/cpu/zhaoxin.c ZONEFS FILESYSTEM -M: Damien Le Moal <damien.lemoal@opensource.wdc.com> +M: Damien Le Moal <dlemoal@kernel.org> M: Naohiro Aota <naohiro.aota@wdc.com> R: Johannes Thumshirn <jth@kernel.org> L: linux-fsdevel@vger.kernel.org @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 3 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e24a9820e12f..d64ad0fe6c0c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -986,7 +986,7 @@ config SMP uniprocessor machines. On a uniprocessor machine, the kernel will run faster if you say N here. - See also <file:Documentation/x86/i386/IO-APIC.rst>, + See also <file:Documentation/arch/x86/i386/IO-APIC.rst>, <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at <http://tldp.org/HOWTO/SMP-HOWTO.html>. diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 2ef651a78fa2..726ecabcef09 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -107,7 +107,7 @@ ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg asflags-y := -DZIMAGE # Supply kernel BSS size to the decompressor via a linker symbol. -KBSS_SZ = $(shell echo $$(($$($(NM) $(obj)/../../../../vmlinux | \ +KBSS_SZ = $(shell echo $$(($$($(NM) vmlinux | \ sed -n -e 's/^\([^ ]*\) [ABD] __bss_start$$/-0x\1/p' \ -e 's/^\([^ ]*\) [ABD] __bss_stop$$/+0x\1/p') )) ) LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ) diff --git a/arch/arm/boot/dts/imx6ull-colibri.dtsi b/arch/arm/boot/dts/imx6ull-colibri.dtsi index bf64ba84b358..fde8a19aac0f 100644 --- a/arch/arm/boot/dts/imx6ull-colibri.dtsi +++ b/arch/arm/boot/dts/imx6ull-colibri.dtsi @@ -33,15 +33,9 @@ self-powered; type = "micro"; - ports { - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - reg = <0>; - usb_dr_connector: endpoint { - remote-endpoint = <&usb1_drd_sw>; - }; + port { + usb_dr_connector: endpoint { + remote-endpoint = <&usb1_drd_sw>; }; }; }; diff --git a/arch/arm/boot/dts/imx7d-remarkable2.dts b/arch/arm/boot/dts/imx7d-remarkable2.dts index 8b2f11e85e05..427f8d04ec89 100644 --- a/arch/arm/boot/dts/imx7d-remarkable2.dts +++ b/arch/arm/boot/dts/imx7d-remarkable2.dts @@ -118,8 +118,6 @@ reg = <0x62>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_epdpmic>; - #address-cells = <1>; - #size-cells = <0>; #thermal-sensor-cells = <0>; epd-pwr-good-gpios = <&gpio6 21 GPIO_ACTIVE_HIGH>; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index 2ca76b69add7..511ca864c1b2 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -942,7 +942,7 @@ status = "disabled"; }; - spdif: sound@ff88b0000 { + spdif: sound@ff8b0000 { compatible = "rockchip,rk3288-spdif", "rockchip,rk3066-spdif"; reg = <0x0 0xff8b0000 0x0 0x10000>; #sound-dai-cells = <0>; diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 6dc6fed12af8..8d002c6e6cb3 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -76,7 +76,7 @@ CONFIG_RFKILL=y CONFIG_RFKILL_INPUT=y CONFIG_PCI=y CONFIG_PCI_MSI=y -CONFIG_PCI_IMX6=y +CONFIG_PCI_IMX6_HOST=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_STANDALONE is not set diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 06b48ce23e1c..505a306e0271 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -244,19 +244,6 @@ THUMB( fpreg .req r7 ) .endm #endif - .macro local_bh_disable, ti, tmp - ldr \tmp, [\ti, #TI_PREEMPT] - add \tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET - str \tmp, [\ti, #TI_PREEMPT] - .endm - - .macro local_bh_enable_ti, ti, tmp - get_thread_info \ti - ldr \tmp, [\ti, #TI_PREEMPT] - sub \tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET - str \tmp, [\ti, #TI_PREEMPT] - .endm - #define USERL(l, x...) \ 9999: x; \ .pushsection __ex_table,"a"; \ diff --git a/arch/arm/vfp/entry.S b/arch/arm/vfp/entry.S index 9a89264cdcc0..7483ef8bccda 100644 --- a/arch/arm/vfp/entry.S +++ b/arch/arm/vfp/entry.S @@ -22,18 +22,7 @@ @ IRQs enabled. @ ENTRY(do_vfp) - local_bh_disable r10, r4 - ldr r4, .LCvfp - ldr r11, [r10, #TI_CPU] @ CPU number - add r10, r10, #TI_VFPSTATE @ r10 = workspace - ldr pc, [r4] @ call VFP entry point + mov r1, r10 + mov r3, r9 + b vfp_entry ENDPROC(do_vfp) - -ENTRY(vfp_null_entry) - local_bh_enable_ti r10, r4 - ret lr -ENDPROC(vfp_null_entry) - - .align 2 -.LCvfp: - .word vfp_vector diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S index 26c4f61ecfa3..4d8478264d82 100644 --- a/arch/arm/vfp/vfphw.S +++ b/arch/arm/vfp/vfphw.S @@ -6,9 +6,9 @@ * Written by Deep Blue Solutions Limited. * * This code is called from the kernel's undefined instruction trap. - * r9 holds the return address for successful handling. + * r1 holds the thread_info pointer + * r3 holds the return address for successful handling. * lr holds the return address for unrecognised instructions. - * r10 points at the start of the private FP workspace in the thread structure * sp points to a struct pt_regs (as defined in include/asm/proc/ptrace.h) */ #include <linux/init.h> @@ -69,13 +69,15 @@ @ VFP hardware support entry point. @ @ r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) +@ r1 = thread_info pointer @ r2 = PC value to resume execution after successful emulation -@ r9 = normal "successful" return address -@ r10 = vfp_state union -@ r11 = CPU number +@ r3 = normal "successful" return address @ lr = unrecognised instruction return address @ IRQs enabled. ENTRY(vfp_support_entry) + ldr r11, [r1, #TI_CPU] @ CPU number + add r10, r1, #TI_VFPSTATE @ r10 = workspace + DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10 .fpu vfpv2 @@ -85,9 +87,9 @@ ENTRY(vfp_support_entry) bne look_for_VFP_exceptions @ VFP is already enabled DBGSTR1 "enable %x", r10 - ldr r3, vfp_current_hw_state_address + ldr r9, vfp_current_hw_state_address orr r1, r1, #FPEXC_EN @ user FPEXC has the enable bit set - ldr r4, [r3, r11, lsl #2] @ vfp_current_hw_state pointer + ldr r4, [r9, r11, lsl #2] @ vfp_current_hw_state pointer bic r5, r1, #FPEXC_EX @ make sure exceptions are disabled cmp r4, r10 @ this thread owns the hw context? #ifndef CONFIG_SMP @@ -146,7 +148,7 @@ vfp_reload_hw: #endif DBGSTR1 "load state %p", r10 - str r10, [r3, r11, lsl #2] @ update the vfp_current_hw_state pointer + str r10, [r9, r11, lsl #2] @ update the vfp_current_hw_state pointer @ Load the saved state back into the VFP VFPFLDMIA r10, r5 @ reload the working registers while @ FPEXC is in a safe state @@ -175,9 +177,12 @@ vfp_hw_state_valid: @ else it's one 32-bit instruction, so @ always subtract 4 from the following @ instruction address. - local_bh_enable_ti r10, r4 - ret r9 @ we think we have handled things + mov lr, r3 @ we think we have handled things +local_bh_enable_and_ret: + adr r0, . + mov r1, #SOFTIRQ_DISABLE_OFFSET + b __local_bh_enable_ip @ tail call look_for_VFP_exceptions: @ Check for synchronous or asynchronous exception @@ -200,13 +205,12 @@ skip: @ not recognised by VFP DBGSTR "not VFP" - local_bh_enable_ti r10, r4 - ret lr + b local_bh_enable_and_ret process_exception: DBGSTR "bounce" mov r2, sp @ nothing stacked - regdump is at TOS - mov lr, r9 @ setup for a return to the user code. + mov lr, r3 @ setup for a return to the user code. @ Now call the C code to package up the bounce to the support code @ r0 holds the trigger instruction diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 01bc48d73847..349dcb944a93 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -32,10 +32,9 @@ /* * Our undef handlers (in entry.S) */ -asmlinkage void vfp_support_entry(void); -asmlinkage void vfp_null_entry(void); +asmlinkage void vfp_support_entry(u32, void *, u32, u32); -asmlinkage void (*vfp_vector)(void) = vfp_null_entry; +static bool have_vfp __ro_after_init; /* * Dual-use variable. @@ -645,6 +644,25 @@ static int vfp_starting_cpu(unsigned int unused) return 0; } +/* + * Entered with: + * + * r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) + * r1 = thread_info pointer + * r2 = PC value to resume execution after successful emulation + * r3 = normal "successful" return address + * lr = unrecognised instruction return address + */ +asmlinkage void vfp_entry(u32 trigger, struct thread_info *ti, u32 resume_pc, + u32 resume_return_address) +{ + if (unlikely(!have_vfp)) + return; + + local_bh_disable(); + vfp_support_entry(trigger, ti, resume_pc, resume_return_address); +} + #ifdef CONFIG_KERNEL_MODE_NEON static int vfp_kmode_exception(struct pt_regs *regs, unsigned int instr) @@ -798,7 +816,6 @@ static int __init vfp_init(void) vfpsid = fmrx(FPSID); barrier(); unregister_undef_hook(&vfp_detect_hook); - vfp_vector = vfp_null_entry; pr_info("VFP support v0.3: "); if (VFP_arch) { @@ -883,7 +900,7 @@ static int __init vfp_init(void) "arm/vfp:starting", vfp_starting_cpu, vfp_dying_cpu); - vfp_vector = vfp_support_entry; + have_vfp = true; thread_register_notifier(&vfp_notifier_block); vfp_pm_init(); diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index 123a56f7f818..feb27a0ccfb4 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -1571,15 +1571,20 @@ dmc: bus@38000 { compatible = "simple-bus"; - reg = <0x0 0x38000 0x0 0x400>; #address-cells = <2>; #size-cells = <2>; - ranges = <0x0 0x0 0x0 0x38000 0x0 0x400>; + ranges = <0x0 0x0 0x0 0x38000 0x0 0x2000>; canvas: video-lut@48 { compatible = "amlogic,canvas"; reg = <0x0 0x48 0x0 0x14>; }; + + pmu: pmu@80 { + reg = <0x0 0x80 0x0 0x40>, + <0x0 0xc00 0x0 0x40>; + interrupts = <GIC_SPI 52 IRQ_TYPE_EDGE_RISING>; + }; }; usb2_phy1: phy@3a000 { @@ -1705,12 +1710,6 @@ }; }; - pmu: pmu@ff638000 { - reg = <0x0 0xff638000 0x0 0x100>, - <0x0 0xff638c00 0x0 0x100>; - interrupts = <GIC_SPI 52 IRQ_TYPE_EDGE_RISING>; - }; - aobus: bus@ff800000 { compatible = "simple-bus"; reg = <0x0 0xff800000 0x0 0x100000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi index d1a6390976a9..3f9dfd4d3884 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi @@ -194,7 +194,7 @@ rohm,reset-snvs-powered; #clock-cells = <0>; - clocks = <&osc_32k 0>; + clocks = <&osc_32k>; clock-output-names = "clk-32k-out"; regulators { diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 88321b5b0693..6f0811587142 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -99,7 +99,7 @@ compatible = "regulator-fixed"; enable-active-high; gpio = <&gpio2 20 GPIO_ACTIVE_HIGH>; /* PMIC_EN_ETH */ - off-on-delay = <500000>; + off-on-delay-us = <500000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_reg_eth>; regulator-always-on; @@ -139,7 +139,7 @@ enable-active-high; /* Verdin SD_1_PWR_EN (SODIMM 76) */ gpio = <&gpio3 5 GPIO_ACTIVE_HIGH>; - off-on-delay = <100000>; + off-on-delay-us = <100000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usdhc2_pwr_en>; regulator-max-microvolt = <3300000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi index 361426c0da0a..c29622529200 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi @@ -10,7 +10,7 @@ compatible = "regulator-fixed"; enable-active-high; gpio = <&gpio_expander_21 4 GPIO_ACTIVE_HIGH>; /* ETH_PWR_EN */ - off-on-delay = <500000>; + off-on-delay-us = <500000>; regulator-max-microvolt = <3300000>; regulator-min-microvolt = <3300000>; regulator-name = "+V3.3_ETH"; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi index 0dd6180a8e39..1608775da0ad 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi @@ -87,7 +87,7 @@ compatible = "regulator-fixed"; enable-active-high; gpio = <&gpio2 20 GPIO_ACTIVE_HIGH>; /* PMIC_EN_ETH */ - off-on-delay = <500000>; + off-on-delay-us = <500000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_reg_eth>; regulator-always-on; @@ -128,7 +128,7 @@ enable-active-high; /* Verdin SD_1_PWR_EN (SODIMM 76) */ gpio = <&gpio4 22 GPIO_ACTIVE_HIGH>; - off-on-delay = <100000>; + off-on-delay-us = <100000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usdhc2_pwr_en>; regulator-max-microvolt = <3300000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index 2dd60e3252f3..a237275ee017 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -1128,7 +1128,7 @@ lcdif2: display-controller@32e90000 { compatible = "fsl,imx8mp-lcdif"; - reg = <0x32e90000 0x238>; + reg = <0x32e90000 0x10000>; interrupts = <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clk IMX8MP_CLK_MEDIA_DISP2_PIX_ROOT>, <&clk IMX8MP_CLK_MEDIA_APB_ROOT>, diff --git a/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts b/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts index ca3f96646b90..5cf07caf4103 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts +++ b/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts @@ -62,11 +62,11 @@ perst-gpios = <&tlmm 58 GPIO_ACTIVE_LOW>; }; -&pcie_phy0 { +&pcie_qmp0 { status = "okay"; }; -&pcie_phy1 { +&pcie_qmp1 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi b/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi index 651a231554e0..1b8379ba87f9 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi @@ -48,11 +48,11 @@ perst-gpios = <&tlmm 61 GPIO_ACTIVE_LOW>; }; -&pcie_phy0 { +&pcie_qmp0 { status = "okay"; }; -&pcie_phy1 { +&pcie_qmp1 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts index aa0a7bd7307c..dd924331b0ee 100644 --- a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts +++ b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts @@ -1012,7 +1012,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -1021,7 +1021,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; reg = <0 4>; - powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; #sound-dai-cells = <0>; diff --git a/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi b/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi index b6137816f2f3..313083ec1f39 100644 --- a/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi @@ -464,7 +464,7 @@ ap_i2c_tpm: &i2c14 { &mdss_dp_out { data-lanes = <0 1>; - link-frequencies = /bits/ 64 <1620000000 2700000000 5400000000 8100000000>; + link-frequencies = /bits/ 64 <1620000000 2700000000 5400000000>; }; &mdss_mdp { diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi b/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi index df7d28f7ae60..be446eba4fa7 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi +++ b/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi @@ -59,8 +59,9 @@ #size-cells = <0>; pmk8280_pon: pon@1300 { - compatible = "qcom,pm8998-pon"; - reg = <0x1300>; + compatible = "qcom,pmk8350-pon"; + reg = <0x1300>, <0x800>; + reg-names = "hlos", "pbs"; pmk8280_pon_pwrkey: pwrkey { compatible = "qcom,pmk8350-pwrkey"; diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 67d2a663ce75..5c688cb6a7ce 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -753,7 +753,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -761,7 +761,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; - powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_LOW>; reg = <0 4>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; diff --git a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts index 9850140514ba..41f59e32af64 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts @@ -662,7 +662,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -670,7 +670,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; - powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_LOW>; reg = <0 4>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; diff --git a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts index e54cdc8bc31f..4c9de236676d 100644 --- a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts @@ -764,7 +764,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&tlmm 26 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 26 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -773,7 +773,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; reg = <0 4>; - powerdown-gpios = <&tlmm 127 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 127 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; #sound-dai-cells = <0>; diff --git a/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts b/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts index 61b31688b469..ce318e05f0a6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts +++ b/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts @@ -24,6 +24,8 @@ &internal_display { compatible = "elida,kd35t133"; + iovcc-supply = <&vcc_lcd>; + vdd-supply = <&vcc_lcd>; }; &pwm0 { diff --git a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi index 04eba432fb0e..80fc53c807a4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi @@ -235,10 +235,8 @@ internal_display: panel@0 { reg = <0>; backlight = <&backlight>; - iovcc-supply = <&vcc_lcd>; reset-gpios = <&gpio3 RK_PC0 GPIO_ACTIVE_LOW>; rotation = <270>; - vdd-supply = <&vcc_lcd>; port { mipi_in_panel: endpoint { diff --git a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts index 139c898e590e..d94ac81eb4e6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts +++ b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts @@ -83,6 +83,8 @@ &internal_display { compatible = "elida,kd35t133"; + iovcc-supply = <&vcc_lcd>; + vdd-supply = <&vcc_lcd>; }; &rk817 { diff --git a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts index 4702183b673c..aa6f5b12206b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts +++ b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts @@ -59,6 +59,8 @@ &internal_display { compatible = "elida,kd35t133"; + iovcc-supply = <&vcc_lcd>; + vdd-supply = <&vcc_lcd>; }; &rk817_charger { diff --git a/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi b/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi index 083452c67711..e47d1398aeca 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi @@ -61,7 +61,6 @@ pinctrl-names = "default"; pinctrl-0 = <&bl_en>; pwms = <&pwm0 0 1000000 PWM_POLARITY_INVERTED>; - pwm-delay-us = <10000>; }; emmc_pwrseq: emmc-pwrseq { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi index ee6095baba4d..5c1929d41cc0 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi @@ -198,7 +198,6 @@ power-supply = <&pp3300_disp>; pinctrl-names = "default"; pinctrl-0 = <&bl_en>; - pwm-delay-us = <10000>; }; gpio_keys: gpio-keys { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi index a47d9f758611..c5e7de60c121 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi @@ -167,7 +167,6 @@ pinctrl-names = "default"; pinctrl-0 = <&bl_en>; pwms = <&pwm1 0 1000000 0>; - pwm-delay-us = <10000>; }; dmic: dmic { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts index 194e48c755f6..ddd45de97950 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts @@ -50,19 +50,9 @@ pinctrl-0 = <&panel_en_pin>; power-supply = <&vcc3v3_panel>; - ports { - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - reg = <0>; - #address-cells = <1>; - #size-cells = <0>; - - panel_in_edp: endpoint@0 { - reg = <0>; - remote-endpoint = <&edp_out_panel>; - }; + port { + panel_in_edp: endpoint { + remote-endpoint = <&edp_out_panel>; }; }; }; @@ -943,7 +933,7 @@ disable-wp; pinctrl-names = "default"; pinctrl-0 = <&sdmmc_clk &sdmmc_cmd &sdmmc_bus4>; - sd-uhs-sdr104; + sd-uhs-sdr50; vmmc-supply = <&vcc3v0_sd>; vqmmc-supply = <&vcc_sdio>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi index 78157521e944..bca2b50e0a93 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi @@ -647,16 +647,10 @@ avdd-supply = <&avdd>; backlight = <&backlight>; dvdd-supply = <&vcc3v3_s0>; - ports { - #address-cells = <1>; - #size-cells = <0>; - port@0 { - reg = <0>; - - mipi_in_panel: endpoint { - remote-endpoint = <&mipi_out_panel>; - }; + port { + mipi_in_panel: endpoint { + remote-endpoint = <&mipi_out_panel>; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 1881b4b71f91..40e7c4a70055 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -552,7 +552,7 @@ <0x0 0xfff10000 0 0x10000>, /* GICH */ <0x0 0xfff20000 0 0x10000>; /* GICV */ interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH 0>; - its: interrupt-controller@fee20000 { + its: msi-controller@fee20000 { compatible = "arm,gic-v3-its"; msi-controller; #msi-cells = <1>; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi index 65a80d1f6d91..9a0e217f069f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi @@ -16,8 +16,10 @@ }; &cru { - assigned-clocks = <&cru PLL_GPLL>, <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; - assigned-clock-rates = <1200000000>, <200000000>, <241500000>; + assigned-clocks = <&pmucru CLK_RTC_32K>, <&cru PLL_GPLL>, + <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; + assigned-clock-rates = <32768>, <1200000000>, + <200000000>, <241500000>; }; &gpio_keys_control { diff --git a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts index b4b2df821cba..c763c7f3b1b3 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts @@ -105,8 +105,10 @@ }; &cru { - assigned-clocks = <&cru PLL_GPLL>, <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; - assigned-clock-rates = <1200000000>, <200000000>, <500000000>; + assigned-clocks = <&pmucru CLK_RTC_32K>, <&cru PLL_GPLL>, + <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; + assigned-clock-rates = <32768>, <1200000000>, + <200000000>, <500000000>; }; &dsi_dphy0 { diff --git a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi index ce7165d7f1a1..102e448bc026 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi @@ -598,7 +598,7 @@ non-removable; pinctrl-names = "default"; pinctrl-0 = <&sdmmc1_bus4 &sdmmc1_cmd &sdmmc1_clk>; - sd-uhs-sdr104; + sd-uhs-sdr50; vmmc-supply = <&vcc3v3_sys>; vqmmc-supply = <&vcc_1v8>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi index 005cde61b4b2..a506948b5572 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi @@ -222,6 +222,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -230,6 +231,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -238,6 +240,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -246,6 +249,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -254,6 +258,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -262,6 +267,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -270,6 +276,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -278,6 +285,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -286,6 +294,7 @@ cache-size = <3145728>; cache-line-size = <64>; cache-sets = <4096>; + cache-level = <3>; }; }; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bcd774d74f34..3dd691c85ca0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -576,9 +576,22 @@ struct kvm_vcpu_arch { ({ \ __build_check_flag(v, flagset, f, m); \ \ - v->arch.flagset & (m); \ + READ_ONCE(v->arch.flagset) & (m); \ }) +/* + * Note that the set/clear accessors must be preempt-safe in order to + * avoid nesting them with load/put which also manipulate flags... + */ +#ifdef __KVM_NVHE_HYPERVISOR__ +/* the nVHE hypervisor is always non-preemptible */ +#define __vcpu_flags_preempt_disable() +#define __vcpu_flags_preempt_enable() +#else +#define __vcpu_flags_preempt_disable() preempt_disable() +#define __vcpu_flags_preempt_enable() preempt_enable() +#endif + #define __vcpu_set_flag(v, flagset, f, m) \ do { \ typeof(v->arch.flagset) *fset; \ @@ -586,9 +599,11 @@ struct kvm_vcpu_arch { __build_check_flag(v, flagset, f, m); \ \ fset = &v->arch.flagset; \ + __vcpu_flags_preempt_disable(); \ if (HWEIGHT(m) > 1) \ *fset &= ~(m); \ *fset |= (f); \ + __vcpu_flags_preempt_enable(); \ } while (0) #define __vcpu_clear_flag(v, flagset, f, m) \ @@ -598,7 +613,9 @@ struct kvm_vcpu_arch { __build_check_flag(v, flagset, f, m); \ \ fset = &v->arch.flagset; \ + __vcpu_flags_preempt_disable(); \ *fset &= ~(m); \ + __vcpu_flags_preempt_enable(); \ } while (0) #define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index ca6eadeb7d1a..f531da6b362e 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -29,7 +29,6 @@ menuconfig KVM select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_XFER_TO_GUEST_WORK - select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3f6a5efdbcf0..4b2e16e696a8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1890,9 +1890,33 @@ static int __init do_pkvm_init(u32 hyp_va_bits) return ret; } +static u64 get_hyp_id_aa64pfr0_el1(void) +{ + /* + * Track whether the system isn't affected by spectre/meltdown in the + * hypervisor's view of id_aa64pfr0_el1, used for protected VMs. + * Although this is per-CPU, we make it global for simplicity, e.g., not + * to have to worry about vcpu migration. + * + * Unlike for non-protected VMs, userspace cannot override this for + * protected VMs. + */ + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); + + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), + arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), + arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); + + return val; +} + static void kvm_hyp_init_symbols(void) { - kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index 07edfc7524c9..37440e1dda93 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -33,11 +33,14 @@ * Allow for protected VMs: * - Floating-point and Advanced SIMD * - Data Independent Timing + * - Spectre/Meltdown Mitigation */ #define PVM_ID_AA64PFR0_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \ ) /* diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 08d2b004f4b7..edd969a1f36b 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -85,19 +85,12 @@ static u64 get_restricted_features_unsigned(u64 sys_reg_val, static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) { - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); u64 set_mask = 0; u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); - /* Spectre and Meltdown mitigation in KVM */ - set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), - (u64)kvm->arch.pfr0_csv2); - set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), - (u64)kvm->arch.pfr0_csv3); - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; } diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 5da884e11337..c4b4678bc4a4 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -397,6 +397,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) u64 val; int wa_level; + if (KVM_REG_SIZE(reg->id) != sizeof(val)) + return -ENOENT; if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) return -EFAULT; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index c243b10f3e15..5eca0cdd961d 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -558,6 +558,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } + kvm_vcpu_pmu_restore_guest(vcpu); } static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1b2c161120be..34688918c811 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -794,7 +794,6 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; kvm_pmu_handle_pmcr(vcpu, val); - kvm_vcpu_pmu_restore_guest(vcpu); } else { /* PMCR.P & PMCR.C are RAZ */ val = __vcpu_sys_reg(vcpu, PMCR_EL0) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index a6acb94ea3d6..c2edadb8ec6a 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -281,4 +281,8 @@ /* DMB */ #define A64_DMB_ISH aarch64_insn_gen_dmb(AARCH64_INSN_MB_ISH) +/* ADR */ +#define A64_ADR(Rd, offset) \ + aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR) + #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 62f805f427b7..b26da8efa616 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1900,7 +1900,8 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, restore_args(ctx, args_off, nargs); /* call original func */ emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx); - emit(A64_BLR(A64_R(10)), ctx); + emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx); + emit(A64_RET(A64_R(10)), ctx); /* store return value */ emit(A64_STR64I(A64_R(0), A64_SP, retval_off), ctx); /* reserve a nop for bpf_tramp_image_put */ diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index ea75d72dea86..e487a46d1c37 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -72,8 +72,6 @@ struct task_struct; /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) -extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); - unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 21dfa4aa35bb..033f5aead88a 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -853,7 +853,7 @@ valid_phys_addr_range (phys_addr_t phys_addr, unsigned long size) * /dev/mem reads and writes use copy_to_user(), which implicitly * uses a granule-sized kernel identity mapping. It's really * only safe to do this for regions in kern_memmap. For more - * details, see Documentation/ia64/aliasing.rst. + * details, see Documentation/arch/ia64/aliasing.rst. */ attr = kern_mem_attribute(phys_addr, size); if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 2094f3249019..cc4733e9990a 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -28,7 +28,7 @@ #include <asm/native/inst.h> /* - * See Documentation/ia64/fsys.rst for details on fsyscalls. + * See Documentation/arch/ia64/fsys.rst for details on fsyscalls. * * On entry to an fsyscall handler: * r10 = 0 (i.e., defaults to "successful syscall return") diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c index 55fd3eb753ff..92b81bc91397 100644 --- a/arch/ia64/mm/ioremap.c +++ b/arch/ia64/mm/ioremap.c @@ -43,7 +43,7 @@ ioremap (unsigned long phys_addr, unsigned long size) /* * For things in kern_memmap, we must use the same attribute * as the rest of the kernel. For more details, see - * Documentation/ia64/aliasing.rst. + * Documentation/arch/ia64/aliasing.rst. */ attr = kern_mem_attribute(phys_addr, size); if (attr & EFI_MEMORY_WB) diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 211757e34198..0a0328e61bef 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -448,7 +448,7 @@ pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma, return -ENOSYS; /* - * Avoid attribute aliasing. See Documentation/ia64/aliasing.rst + * Avoid attribute aliasing. See Documentation/arch/ia64/aliasing.rst * for more details. */ if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size)) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 7fd51257e0ed..3ddde336e6a5 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -447,6 +447,22 @@ config ARCH_IOREMAP protection support. However, you can enable LoongArch DMW-based ioremap() for better performance. +config ARCH_WRITECOMBINE + bool "Enable WriteCombine (WUC) for ioremap()" + help + LoongArch maintains cache coherency in hardware, but when paired + with LS7A chipsets the WUC attribute (Weak-ordered UnCached, which + is similar to WriteCombine) is out of the scope of cache coherency + machanism for PCIe devices (this is a PCIe protocol violation, which + may be fixed in newer chipsets). + + This means WUC can only used for write-only memory regions now, so + this option is disabled by default, making WUC silently fallback to + SUC for ioremap(). You can enable this option if the kernel is ensured + to run on hardware without this bug. + + You can override this setting via writecombine=on/off boot parameter. + config ARCH_STRICT_ALIGN bool "Enable -mstrict-align to prevent unaligned accesses" if EXPERT default y diff --git a/arch/loongarch/include/asm/acpi.h b/arch/loongarch/include/asm/acpi.h index 4198753aa1d0..976a810352c6 100644 --- a/arch/loongarch/include/asm/acpi.h +++ b/arch/loongarch/include/asm/acpi.h @@ -41,8 +41,11 @@ extern void loongarch_suspend_enter(void); static inline unsigned long acpi_get_wakeup_address(void) { +#ifdef CONFIG_SUSPEND extern void loongarch_wakeup_start(void); return (unsigned long)loongarch_wakeup_start; +#endif + return 0UL; } #endif /* _ASM_LOONGARCH_ACPI_H */ diff --git a/arch/loongarch/include/asm/addrspace.h b/arch/loongarch/include/asm/addrspace.h index 8fb699b4d40a..5c9c03bdf915 100644 --- a/arch/loongarch/include/asm/addrspace.h +++ b/arch/loongarch/include/asm/addrspace.h @@ -71,9 +71,9 @@ extern unsigned long vm_map_base; #define _ATYPE32_ int #define _ATYPE64_ __s64 #ifdef CONFIG_64BIT -#define _CONST64_(x) x ## L +#define _CONST64_(x) x ## UL #else -#define _CONST64_(x) x ## LL +#define _CONST64_(x) x ## ULL #endif #endif diff --git a/arch/loongarch/include/asm/bootinfo.h b/arch/loongarch/include/asm/bootinfo.h index 0051b526ac6d..c60796869b2b 100644 --- a/arch/loongarch/include/asm/bootinfo.h +++ b/arch/loongarch/include/asm/bootinfo.h @@ -13,7 +13,6 @@ const char *get_system_type(void); extern void init_environ(void); extern void memblock_init(void); extern void platform_init(void); -extern void plat_swiotlb_setup(void); extern int __init init_numa_memory(void); struct loongson_board_info { diff --git a/arch/loongarch/include/asm/cpu-features.h b/arch/loongarch/include/asm/cpu-features.h index b07974218393..f6177f133477 100644 --- a/arch/loongarch/include/asm/cpu-features.h +++ b/arch/loongarch/include/asm/cpu-features.h @@ -42,6 +42,7 @@ #define cpu_has_fpu cpu_opt(LOONGARCH_CPU_FPU) #define cpu_has_lsx cpu_opt(LOONGARCH_CPU_LSX) #define cpu_has_lasx cpu_opt(LOONGARCH_CPU_LASX) +#define cpu_has_crc32 cpu_opt(LOONGARCH_CPU_CRC32) #define cpu_has_complex cpu_opt(LOONGARCH_CPU_COMPLEX) #define cpu_has_crypto cpu_opt(LOONGARCH_CPU_CRYPTO) #define cpu_has_lvz cpu_opt(LOONGARCH_CPU_LVZ) diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h index c3da91759472..88773d849e33 100644 --- a/arch/loongarch/include/asm/cpu.h +++ b/arch/loongarch/include/asm/cpu.h @@ -78,25 +78,26 @@ enum cpu_type_enum { #define CPU_FEATURE_FPU 3 /* CPU has FPU */ #define CPU_FEATURE_LSX 4 /* CPU has LSX (128-bit SIMD) */ #define CPU_FEATURE_LASX 5 /* CPU has LASX (256-bit SIMD) */ -#define CPU_FEATURE_COMPLEX 6 /* CPU has Complex instructions */ -#define CPU_FEATURE_CRYPTO 7 /* CPU has Crypto instructions */ -#define CPU_FEATURE_LVZ 8 /* CPU has Virtualization extension */ -#define CPU_FEATURE_LBT_X86 9 /* CPU has X86 Binary Translation */ -#define CPU_FEATURE_LBT_ARM 10 /* CPU has ARM Binary Translation */ -#define CPU_FEATURE_LBT_MIPS 11 /* CPU has MIPS Binary Translation */ -#define CPU_FEATURE_TLB 12 /* CPU has TLB */ -#define CPU_FEATURE_CSR 13 /* CPU has CSR */ -#define CPU_FEATURE_WATCH 14 /* CPU has watchpoint registers */ -#define CPU_FEATURE_VINT 15 /* CPU has vectored interrupts */ -#define CPU_FEATURE_CSRIPI 16 /* CPU has CSR-IPI */ -#define CPU_FEATURE_EXTIOI 17 /* CPU has EXT-IOI */ -#define CPU_FEATURE_PREFETCH 18 /* CPU has prefetch instructions */ -#define CPU_FEATURE_PMP 19 /* CPU has perfermance counter */ -#define CPU_FEATURE_SCALEFREQ 20 /* CPU supports cpufreq scaling */ -#define CPU_FEATURE_FLATMODE 21 /* CPU has flat mode */ -#define CPU_FEATURE_EIODECODE 22 /* CPU has EXTIOI interrupt pin decode mode */ -#define CPU_FEATURE_GUESTID 23 /* CPU has GuestID feature */ -#define CPU_FEATURE_HYPERVISOR 24 /* CPU has hypervisor (running in VM) */ +#define CPU_FEATURE_CRC32 6 /* CPU has CRC32 instructions */ +#define CPU_FEATURE_COMPLEX 7 /* CPU has Complex instructions */ +#define CPU_FEATURE_CRYPTO 8 /* CPU has Crypto instructions */ +#define CPU_FEATURE_LVZ 9 /* CPU has Virtualization extension */ +#define CPU_FEATURE_LBT_X86 10 /* CPU has X86 Binary Translation */ +#define CPU_FEATURE_LBT_ARM 11 /* CPU has ARM Binary Translation */ +#define CPU_FEATURE_LBT_MIPS 12 /* CPU has MIPS Binary Translation */ +#define CPU_FEATURE_TLB 13 /* CPU has TLB */ +#define CPU_FEATURE_CSR 14 /* CPU has CSR */ +#define CPU_FEATURE_WATCH 15 /* CPU has watchpoint registers */ +#define CPU_FEATURE_VINT 16 /* CPU has vectored interrupts */ +#define CPU_FEATURE_CSRIPI 17 /* CPU has CSR-IPI */ +#define CPU_FEATURE_EXTIOI 18 /* CPU has EXT-IOI */ +#define CPU_FEATURE_PREFETCH 19 /* CPU has prefetch instructions */ +#define CPU_FEATURE_PMP 20 /* CPU has perfermance counter */ +#define CPU_FEATURE_SCALEFREQ 21 /* CPU supports cpufreq scaling */ +#define CPU_FEATURE_FLATMODE 22 /* CPU has flat mode */ +#define CPU_FEATURE_EIODECODE 23 /* CPU has EXTIOI interrupt pin decode mode */ +#define CPU_FEATURE_GUESTID 24 /* CPU has GuestID feature */ +#define CPU_FEATURE_HYPERVISOR 25 /* CPU has hypervisor (running in VM) */ #define LOONGARCH_CPU_CPUCFG BIT_ULL(CPU_FEATURE_CPUCFG) #define LOONGARCH_CPU_LAM BIT_ULL(CPU_FEATURE_LAM) @@ -104,6 +105,7 @@ enum cpu_type_enum { #define LOONGARCH_CPU_FPU BIT_ULL(CPU_FEATURE_FPU) #define LOONGARCH_CPU_LSX BIT_ULL(CPU_FEATURE_LSX) #define LOONGARCH_CPU_LASX BIT_ULL(CPU_FEATURE_LASX) +#define LOONGARCH_CPU_CRC32 BIT_ULL(CPU_FEATURE_CRC32) #define LOONGARCH_CPU_COMPLEX BIT_ULL(CPU_FEATURE_COMPLEX) #define LOONGARCH_CPU_CRYPTO BIT_ULL(CPU_FEATURE_CRYPTO) #define LOONGARCH_CPU_LVZ BIT_ULL(CPU_FEATURE_LVZ) diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 402a7d9e3a53..545e2708fbf7 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -54,8 +54,10 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, * @offset: bus address of the memory * @size: size of the resource to map */ +extern pgprot_t pgprot_wc; + #define ioremap_wc(offset, size) \ - ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL_WUC)) + ioremap_prot((offset), (size), pgprot_val(pgprot_wc)) #define ioremap_cache(offset, size) \ ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL)) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 65b7dcdea16d..83da5d29e2d1 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -117,7 +117,7 @@ static inline u32 read_cpucfg(u32 reg) #define CPUCFG1_EP BIT(22) #define CPUCFG1_RPLV BIT(23) #define CPUCFG1_HUGEPG BIT(24) -#define CPUCFG1_IOCSRBRD BIT(25) +#define CPUCFG1_CRC32 BIT(25) #define CPUCFG1_MSGINT BIT(26) #define LOONGARCH_CPUCFG2 0x2 @@ -423,9 +423,9 @@ static __always_inline void iocsr_write64(u64 val, u32 reg) #define CSR_ASID_ASID_WIDTH 10 #define CSR_ASID_ASID (_ULCAST_(0x3ff) << CSR_ASID_ASID_SHIFT) -#define LOONGARCH_CSR_PGDL 0x19 /* Page table base address when VA[47] = 0 */ +#define LOONGARCH_CSR_PGDL 0x19 /* Page table base address when VA[VALEN-1] = 0 */ -#define LOONGARCH_CSR_PGDH 0x1a /* Page table base address when VA[47] = 1 */ +#define LOONGARCH_CSR_PGDH 0x1a /* Page table base address when VA[VALEN-1] = 1 */ #define LOONGARCH_CSR_PGD 0x1b /* Page table base */ diff --git a/arch/loongarch/include/asm/module.lds.h b/arch/loongarch/include/asm/module.lds.h index 438f09d4ccf4..88554f92e010 100644 --- a/arch/loongarch/include/asm/module.lds.h +++ b/arch/loongarch/include/asm/module.lds.h @@ -2,8 +2,8 @@ /* Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ SECTIONS { . = ALIGN(4); - .got : { BYTE(0) } - .plt : { BYTE(0) } - .plt.idx : { BYTE(0) } - .ftrace_trampoline : { BYTE(0) } + .got 0 : { BYTE(0) } + .plt 0 : { BYTE(0) } + .plt.idx 0 : { BYTE(0) } + .ftrace_trampoline 0 : { BYTE(0) } } diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h index cc48ed262021..82d811b5c6e9 100644 --- a/arch/loongarch/include/uapi/asm/ptrace.h +++ b/arch/loongarch/include/uapi/asm/ptrace.h @@ -47,11 +47,12 @@ struct user_fp_state { }; struct user_watch_state { - uint16_t dbg_info; + uint64_t dbg_info; struct { uint64_t addr; uint64_t mask; uint32_t ctrl; + uint32_t pad; } dbg_regs[8]; }; diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index 3a3fce2d7846..5adf0f736c6d 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -60,7 +60,7 @@ static inline void set_elf_platform(int cpu, const char *plat) /* MAP BASE */ unsigned long vm_map_base; -EXPORT_SYMBOL_GPL(vm_map_base); +EXPORT_SYMBOL(vm_map_base); static void cpu_probe_addrbits(struct cpuinfo_loongarch *c) { @@ -94,13 +94,18 @@ static void cpu_probe_common(struct cpuinfo_loongarch *c) c->options = LOONGARCH_CPU_CPUCFG | LOONGARCH_CPU_CSR | LOONGARCH_CPU_TLB | LOONGARCH_CPU_VINT | LOONGARCH_CPU_WATCH; - elf_hwcap = HWCAP_LOONGARCH_CPUCFG | HWCAP_LOONGARCH_CRC32; + elf_hwcap = HWCAP_LOONGARCH_CPUCFG; config = read_cpucfg(LOONGARCH_CPUCFG1); if (config & CPUCFG1_UAL) { c->options |= LOONGARCH_CPU_UAL; elf_hwcap |= HWCAP_LOONGARCH_UAL; } + if (config & CPUCFG1_CRC32) { + c->options |= LOONGARCH_CPU_CRC32; + elf_hwcap |= HWCAP_LOONGARCH_CRC32; + } + config = read_cpucfg(LOONGARCH_CPUCFG2); if (config & CPUCFG2_LAM) { diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index 5c67cc4fd56d..0d82907b5404 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -76,6 +76,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has_fpu) seq_printf(m, " fpu"); if (cpu_has_lsx) seq_printf(m, " lsx"); if (cpu_has_lasx) seq_printf(m, " lasx"); + if (cpu_has_crc32) seq_printf(m, " crc32"); if (cpu_has_complex) seq_printf(m, " complex"); if (cpu_has_crypto) seq_printf(m, " crypto"); if (cpu_has_lvz) seq_printf(m, " lvz"); diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index 06bceae7d104..5fcffb452367 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -391,10 +391,10 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, return 0; } -static int ptrace_hbp_get_resource_info(unsigned int note_type, u16 *info) +static int ptrace_hbp_get_resource_info(unsigned int note_type, u64 *info) { u8 num; - u16 reg = 0; + u64 reg = 0; switch (note_type) { case NT_LOONGARCH_HW_BREAK: @@ -524,15 +524,16 @@ static int ptrace_hbp_set_addr(unsigned int note_type, return modify_user_hw_breakpoint(bp, &attr); } -#define PTRACE_HBP_CTRL_SZ sizeof(u32) #define PTRACE_HBP_ADDR_SZ sizeof(u64) #define PTRACE_HBP_MASK_SZ sizeof(u64) +#define PTRACE_HBP_CTRL_SZ sizeof(u32) +#define PTRACE_HBP_PAD_SZ sizeof(u32) static int hw_break_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - u16 info; + u64 info; u32 ctrl; u64 addr, mask; int ret, idx = 0; @@ -545,7 +546,7 @@ static int hw_break_get(struct task_struct *target, membuf_write(&to, &info, sizeof(info)); - /* (address, ctrl) registers */ + /* (address, mask, ctrl) registers */ while (to.left) { ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); if (ret) @@ -562,6 +563,7 @@ static int hw_break_get(struct task_struct *target, membuf_store(&to, addr); membuf_store(&to, mask); membuf_store(&to, ctrl); + membuf_zero(&to, sizeof(u32)); idx++; } @@ -582,7 +584,7 @@ static int hw_break_set(struct task_struct *target, offset = offsetof(struct user_watch_state, dbg_regs); user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); - /* (address, ctrl) registers */ + /* (address, mask, ctrl) registers */ limit = regset->n * regset->size; while (count && offset < limit) { if (count < PTRACE_HBP_ADDR_SZ) @@ -602,7 +604,7 @@ static int hw_break_set(struct task_struct *target, break; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &mask, - offset, offset + PTRACE_HBP_ADDR_SZ); + offset, offset + PTRACE_HBP_MASK_SZ); if (ret) return ret; @@ -611,8 +613,8 @@ static int hw_break_set(struct task_struct *target, return ret; offset += PTRACE_HBP_MASK_SZ; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &mask, - offset, offset + PTRACE_HBP_MASK_SZ); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, + offset, offset + PTRACE_HBP_CTRL_SZ); if (ret) return ret; @@ -620,6 +622,11 @@ static int hw_break_set(struct task_struct *target, if (ret) return ret; offset += PTRACE_HBP_CTRL_SZ; + + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + offset, offset + PTRACE_HBP_PAD_SZ); + offset += PTRACE_HBP_PAD_SZ; + idx++; } diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index bae84ccf6d36..4444b13418f0 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -160,6 +160,27 @@ static void __init smbios_parse(void) dmi_walk(find_tokens, NULL); } +#ifdef CONFIG_ARCH_WRITECOMBINE +pgprot_t pgprot_wc = PAGE_KERNEL_WUC; +#else +pgprot_t pgprot_wc = PAGE_KERNEL_SUC; +#endif + +EXPORT_SYMBOL(pgprot_wc); + +static int __init setup_writecombine(char *p) +{ + if (!strcmp(p, "on")) + pgprot_wc = PAGE_KERNEL_WUC; + else if (!strcmp(p, "off")) + pgprot_wc = PAGE_KERNEL_SUC; + else + pr_warn("Unknown writecombine setting \"%s\".\n", p); + + return 0; +} +early_param("writecombine", setup_writecombine); + static int usermem __initdata; static int __init early_parse_mem(char *p) @@ -368,8 +389,8 @@ static void __init arch_mem_init(char **cmdline_p) /* * In order to reduce the possibility of kernel panic when failed to * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate - * low memory as small as possible before plat_swiotlb_setup(), so - * make sparse_init() using top-down allocation. + * low memory as small as possible before swiotlb_init(), so make + * sparse_init() using top-down allocation. */ memblock_set_bottom_up(false); sparse_init(); diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c index 3a690f96f00c..2463d2fea21f 100644 --- a/arch/loongarch/kernel/stacktrace.c +++ b/arch/loongarch/kernel/stacktrace.c @@ -30,7 +30,7 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, regs->regs[1] = 0; for (unwind_start(&state, task, regs); - !unwind_done(&state); unwind_next_frame(&state)) { + !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || !consume_entry(cookie, addr)) break; diff --git a/arch/loongarch/kernel/unwind.c b/arch/loongarch/kernel/unwind.c index a463d6961344..ba324ba76fa1 100644 --- a/arch/loongarch/kernel/unwind.c +++ b/arch/loongarch/kernel/unwind.c @@ -28,5 +28,6 @@ bool default_next_frame(struct unwind_state *state) } while (!get_stack_info(state->sp, state->task, info)); + state->error = true; return false; } diff --git a/arch/loongarch/kernel/unwind_prologue.c b/arch/loongarch/kernel/unwind_prologue.c index 9095fde8e55d..55afc27320e1 100644 --- a/arch/loongarch/kernel/unwind_prologue.c +++ b/arch/loongarch/kernel/unwind_prologue.c @@ -211,7 +211,7 @@ static bool next_frame(struct unwind_state *state) pc = regs->csr_era; if (user_mode(regs) || !__kernel_text_address(pc)) - return false; + goto out; state->first = true; state->pc = pc; @@ -226,6 +226,8 @@ static bool next_frame(struct unwind_state *state) } while (!get_stack_info(state->sp, state->task, info)); +out: + state->error = true; return false; } diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index e018aed34586..3b7d8129570b 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -41,7 +41,7 @@ * don't have to care about aliases on other CPUs. */ unsigned long empty_zero_page, zero_page_mask; -EXPORT_SYMBOL_GPL(empty_zero_page); +EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); void setup_zero_pages(void) @@ -270,7 +270,7 @@ pud_t invalid_pud_table[PTRS_PER_PUD] __page_aligned_bss; #endif #ifndef __PAGETABLE_PMD_FOLDED pmd_t invalid_pmd_table[PTRS_PER_PMD] __page_aligned_bss; -EXPORT_SYMBOL_GPL(invalid_pmd_table); +EXPORT_SYMBOL(invalid_pmd_table); #endif pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss; EXPORT_SYMBOL(invalid_pte_table); diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 288003a9f0ca..d586df48ecc6 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1022,6 +1022,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext emit_atomic(insn, ctx); break; + /* Speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + default: pr_err("bpf_jit: unknown opcode %02x\n", code); return -EINVAL; diff --git a/arch/loongarch/power/suspend_asm.S b/arch/loongarch/power/suspend_asm.S index 90da899c06a1..e2fc3b4e31f0 100644 --- a/arch/loongarch/power/suspend_asm.S +++ b/arch/loongarch/power/suspend_asm.S @@ -80,6 +80,10 @@ SYM_INNER_LABEL(loongarch_wakeup_start, SYM_L_GLOBAL) JUMP_VIRT_ADDR t0, t1 + /* Enable PG */ + li.w t0, 0xb0 # PLV=0, IE=0, PG=1 + csrwr t0, LOONGARCH_CSR_CRMD + la.pcrel t0, acpi_saved_sp ld.d sp, t0, 0 SETUP_WAKEUP diff --git a/arch/m68k/Kconfig.debug b/arch/m68k/Kconfig.debug index 465e28be0ce4..30638a6e8edc 100644 --- a/arch/m68k/Kconfig.debug +++ b/arch/m68k/Kconfig.debug @@ -36,11 +36,6 @@ config HIGHPROFILE help Use a fast secondary clock to produce profiling information. -config NO_KERNEL_MSG - bool "Suppress Kernel BUG Messages" - help - Do not output any debug BUG messages within the kernel. - config BDM_DISABLE bool "Disable BDM signals" depends on COLDFIRE diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine index e2f961208f18..28eebabfd34b 100644 --- a/arch/m68k/Kconfig.machine +++ b/arch/m68k/Kconfig.machine @@ -11,7 +11,7 @@ config AMIGA help This option enables support for the Amiga series of computers. If you plan to use this kernel on an Amiga, say Y here and browse the - material available in <file:Documentation/m68k>; otherwise say N. + material available in <file:Documentation/arch/m68k>; otherwise say N. config ATARI bool "Atari support" @@ -23,7 +23,7 @@ config ATARI This option enables support for the 68000-based Atari series of computers (including the TT, Falcon and Medusa). If you plan to use this kernel on an Atari, say Y here and browse the material - available in <file:Documentation/m68k>; otherwise say N. + available in <file:Documentation/arch/m68k>; otherwise say N. config ATARI_KBD_CORE bool diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index ec2d792015a4..b26469a65bc1 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -214,7 +214,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -495,6 +494,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -621,6 +621,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 061a07824dc2..944a49a129be 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -210,7 +210,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -452,6 +451,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -577,6 +577,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 02af5f501dae..a32dd884fcce 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -217,7 +217,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -472,6 +471,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -598,6 +598,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 0d5832cb3e10..23b7805309bd 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -207,7 +207,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -444,6 +443,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -569,6 +569,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index c246c3538839..5605ab5c3dcf 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -209,7 +209,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -454,6 +453,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -579,6 +579,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 98d2d0599e5a..d0d1f9c33756 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -208,7 +208,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -474,6 +473,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -600,6 +600,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index b2d5ec6ba625..6d04314ce7ea 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -228,7 +228,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -314,7 +313,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_PCCARD=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -561,6 +559,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -687,6 +686,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index d3420c642992..e6f5ae526d08 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -206,7 +206,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -443,6 +442,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -568,6 +568,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index e294b0b67695..f2d4dff4787a 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -207,7 +207,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -444,6 +443,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -569,6 +569,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 764a94b08936..907eedecd040 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -208,7 +208,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -461,6 +460,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -587,6 +587,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index d4eeddac6bb8..9e3d47008f21 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -204,7 +204,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -443,6 +442,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -567,6 +567,7 @@ CONFIG_TEST_LOCKUP=m CONFIG_WW_MUTEX_SELFTEST=m CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index ca359b880683..f6540078cb4b 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -204,7 +204,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -442,6 +441,7 @@ CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m +CONFIG_RPCSEC_GSS_KRB5=m CONFIG_CIFS=m # CONFIG_CIFS_STATS2 is not set # CONFIG_CIFS_DEBUG is not set @@ -567,6 +567,7 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m +CONFIG_TEST_DHRY=m CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/kernel/machine_kexec.c b/arch/m68k/kernel/machine_kexec.c index 206f84983120..739875540e89 100644 --- a/arch/m68k/kernel/machine_kexec.c +++ b/arch/m68k/kernel/machine_kexec.c @@ -6,6 +6,7 @@ #include <linux/kexec.h> #include <linux/mm.h> #include <linux/delay.h> +#include <linux/reboot.h> #include <asm/cacheflush.h> #include <asm/page.h> diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 52cbde60edf5..9ff55cb80a64 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -15,6 +15,8 @@ #define EMITS_PT_NOTE #endif +#define RUNTIME_DISCARD_EXIT + #include <asm-generic/vmlinux.lds.h> #undef mips diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 29e51649203b..a8cdba75f98d 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -26,7 +26,6 @@ config KVM select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select MMU_NOTIFIER - select SRCU select INTERVAL_TREE select KVM_GENERIC_HARDWARE_ENABLING help diff --git a/arch/nios2/include/asm/thread_info.h b/arch/nios2/include/asm/thread_info.h index bcc0e9915ebd..5abac9893b32 100644 --- a/arch/nios2/include/asm/thread_info.h +++ b/arch/nios2/include/asm/thread_info.h @@ -96,9 +96,6 @@ static inline struct thread_info *current_thread_info(void) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK 0x0000FFFE -/* work to do on any return to u-space */ -# define _TIF_ALLWORK_MASK 0x0000FFFF - #endif /* __KERNEL__ */ #endif /* _ASM_NIOS2_THREAD_INFO_H */ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index a9f57dad6d91..902611954200 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select PREEMPT_NOTIFIERS select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL - select SRCU select KVM_VFIO select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b44ce71917d7..16cfe56be05b 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -366,6 +366,7 @@ void update_numa_distance(struct device_node *node) WARN(numa_distance_table[nid][nid] == -1, "NUMA distance details for node %d not provided\n", nid); } +EXPORT_SYMBOL_GPL(update_numa_distance); /* * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 2f8385523a13..1a53e048ceb7 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -1428,6 +1428,13 @@ static int papr_scm_probe(struct platform_device *pdev) return -ENODEV; } + /* + * open firmware platform device create won't update the NUMA + * distance table. For PAPR SCM devices we use numa_map_to_online_node() + * to find the nearest online NUMA node and that requires correct + * distance table information. + */ + update_numa_distance(dn); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index 07e2e2649604..f87c5164d9cf 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -259,7 +259,6 @@ <&sysclk K210_CLK_APB0>; clock-names = "ssi_clk", "pclk"; resets = <&sysrst K210_RST_SPI2>; - spi-max-frequency = <25000000>; }; i2s0: i2s@50250000 { diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 5c3e7b97fcc6..0a55099bb734 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -22,6 +22,14 @@ */ enum fixed_addresses { FIX_HOLE, + /* + * The fdt fixmap mapping must be PMD aligned and will be mapped + * using PMD entries in fixmap_pmd in 64-bit and a PGD entry in 32-bit. + */ + FIX_FDT_END, + FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1, + + /* Below fixmaps will be mapped using fixmap_pte */ FIX_PTE, FIX_PMD, FIX_PUD, diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ab05f892d317..f641837ccf31 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -87,9 +87,13 @@ #define FIXADDR_TOP PCI_IO_START #ifdef CONFIG_64BIT -#define FIXADDR_SIZE PMD_SIZE +#define MAX_FDT_SIZE PMD_SIZE +#define FIX_FDT_SIZE (MAX_FDT_SIZE + SZ_2M) +#define FIXADDR_SIZE (PMD_SIZE + FIX_FDT_SIZE) #else -#define FIXADDR_SIZE PGDIR_SIZE +#define MAX_FDT_SIZE PGDIR_SIZE +#define FIX_FDT_SIZE MAX_FDT_SIZE +#define FIXADDR_SIZE (PGDIR_SIZE + FIX_FDT_SIZE) #endif #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 376d2827e736..a059b73f4ddb 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -278,12 +278,8 @@ void __init setup_arch(char **cmdline_p) #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa)))) - unflatten_device_tree(); - else - pr_err("No DTB found in kernel mappings\n"); + unflatten_device_tree(); #endif - early_init_fdt_scan_reserved_mem(); misc_mem_init(); init_resources(); diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index bfb2afa4135f..dee66c9290cc 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -19,6 +19,7 @@ #include <asm/signal32.h> #include <asm/switch_to.h> #include <asm/csr.h> +#include <asm/cacheflush.h> extern u32 __user_rt_sigreturn[2]; @@ -181,6 +182,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, { struct rt_sigframe __user *frame; long err = 0; + unsigned long __maybe_unused addr; frame = get_sigframe(ksig, regs, sizeof(*frame)); if (!access_ok(frame, sizeof(*frame))) @@ -209,7 +211,12 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, if (copy_to_user(&frame->sigreturn_code, __user_rt_sigreturn, sizeof(frame->sigreturn_code))) return -EFAULT; - regs->ra = (unsigned long)&frame->sigreturn_code; + + addr = (unsigned long)&frame->sigreturn_code; + /* Make sure the two instructions are pushed to icache. */ + flush_icache_range(addr, addr + sizeof(frame->sigreturn_code)); + + regs->ra = addr; #endif /* CONFIG_MMU */ /* diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index d5a658a047a7..5682d8c017b3 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -28,7 +28,6 @@ config KVM select KVM_XFER_TO_GUEST_WORK select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD - select SRCU help Support hosting virtualized guest machines. diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 478d6763a01a..0f14f4a8d179 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -57,7 +57,6 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] EXPORT_SYMBOL(empty_zero_page); extern char _start[]; -#define DTB_EARLY_BASE_VA PGDIR_SIZE void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; @@ -236,31 +235,22 @@ static void __init setup_bootmem(void) set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); reserve_initrd_mem(); + + /* + * No allocation should be done before reserving the memory as defined + * in the device tree, otherwise the allocation could end up in a + * reserved region. + */ + early_init_fdt_scan_reserved_mem(); + /* * If DTB is built in, no need to reserve its memblock. * Otherwise, do reserve it but avoid using * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { - /* - * In case the DTB is not located in a memory region we won't - * be able to locate it later on via the linear mapping and - * get a segfault when accessing it via __va(dtb_early_pa). - * To avoid this situation copy DTB to a memory region. - * Note that memblock_phys_alloc will also reserve DTB region. - */ - if (!memblock_is_memory(dtb_early_pa)) { - size_t fdt_size = fdt_totalsize(dtb_early_va); - phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); - void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); - - memcpy(new_dtb_early_va, dtb_early_va, fdt_size); - early_memunmap(new_dtb_early_va, fdt_size); - _dtb_early_pa = new_dtb_early_pa; - } else - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); - } + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); dma_contiguous_reserve(dma32_phys_limit); if (IS_ENABLED(CONFIG_64BIT)) @@ -279,9 +269,6 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); -static p4d_t __maybe_unused early_dtb_p4d[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); -static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) @@ -626,9 +613,6 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define trampoline_pgd_next (pgtable_l5_enabled ? \ (uintptr_t)trampoline_p4d : (pgtable_l4_enabled ? \ (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)) -#define early_dtb_pgd_next (pgtable_l5_enabled ? \ - (uintptr_t)early_dtb_p4d : (pgtable_l4_enabled ? \ - (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) @@ -636,7 +620,6 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next ((uintptr_t)fixmap_pte) -#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) #define create_p4d_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) @@ -860,32 +843,28 @@ static void __init create_kernel_page_table(pgd_t *pgdir, bool early) * this means 2 PMD entries whereas for 32-bit kernel, this is only 1 PGDIR * entry. */ -static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) +static void __init create_fdt_early_page_table(pgd_t *pgdir, + uintptr_t fix_fdt_va, + uintptr_t dtb_pa) { -#ifndef CONFIG_BUILTIN_DTB uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); - create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, - PGDIR_SIZE, - IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL); - - if (pgtable_l5_enabled) - create_p4d_mapping(early_dtb_p4d, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pud, P4D_SIZE, PAGE_TABLE); - - if (pgtable_l4_enabled) - create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); +#ifndef CONFIG_BUILTIN_DTB + /* Make sure the fdt fixmap address is always aligned on PMD size */ + BUILD_BUG_ON(FIX_FDT % (PMD_SIZE / PAGE_SIZE)); - if (IS_ENABLED(CONFIG_64BIT)) { - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, + /* In 32-bit only, the fdt lies in its own PGD */ + if (!IS_ENABLED(CONFIG_64BIT)) { + create_pgd_mapping(early_pg_dir, fix_fdt_va, + pa, MAX_FDT_SIZE, PAGE_KERNEL); + } else { + create_pmd_mapping(fixmap_pmd, fix_fdt_va, pa, PMD_SIZE, PAGE_KERNEL); - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, + create_pmd_mapping(fixmap_pmd, fix_fdt_va + PMD_SIZE, pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); } - dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); + dtb_early_va = (void *)fix_fdt_va + (dtb_pa & (PMD_SIZE - 1)); #else /* * For 64-bit kernel, __va can't be used since it would return a linear @@ -1055,7 +1034,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_kernel_page_table(early_pg_dir, true); /* Setup early mapping for FDT early scan */ - create_fdt_early_page_table(early_pg_dir, dtb_pa); + create_fdt_early_page_table(early_pg_dir, + __fix_to_virt(FIX_FDT), dtb_pa); /* * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap @@ -1097,6 +1077,16 @@ static void __init setup_vm_final(void) u64 i; /* Setup swapper PGD for fixmap */ +#if !defined(CONFIG_64BIT) + /* + * In 32-bit, the device tree lies in a pgd entry, so it must be copied + * directly in swapper_pg_dir in addition to the pgd entry that points + * to fixmap_pte. + */ + unsigned long idx = pgd_index(__fix_to_virt(FIX_FDT)); + + set_pgd(&swapper_pg_dir[idx], early_pg_dir[idx]); +#endif create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), PGDIR_SIZE, PAGE_TABLE); diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index d16bf715a586..5730797a6b40 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -84,12 +84,7 @@ CFLAGS_string.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_ctype.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_ctype.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_entry.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_memcpy.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_memset.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strcmp.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strlen.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strncmp.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 33f4ff909476..45fdf2a9b2e3 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -31,7 +31,6 @@ config KVM select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL - select SRCU select KVM_VFIO select INTERVAL_TREE select MMU_NOTIFIER diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index d0846ba818ee..6b1876e4ad3f 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -539,7 +539,7 @@ static void bpf_jit_plt(void *plt, void *ret, void *target) { memcpy(plt, bpf_plt, BPF_PLT_SIZE); *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; - *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target; + *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; } /* @@ -2010,7 +2010,9 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, } __packed insn; char expected_plt[BPF_PLT_SIZE]; char current_plt[BPF_PLT_SIZE]; + char new_plt[BPF_PLT_SIZE]; char *plt; + char *ret; int err; /* Verify the branch to be patched. */ @@ -2032,12 +2034,15 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); if (err < 0) return err; - bpf_jit_plt(expected_plt, (char *)ip + 6, old_addr); + ret = (char *)ip + 6; + bpf_jit_plt(expected_plt, ret, old_addr); if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) return -EINVAL; /* Adjust the call address. */ + bpf_jit_plt(new_plt, ret, new_addr); s390_kernel_write(plt + (bpf_plt_target - bpf_plt), - &new_addr, sizeof(void *)); + new_plt + (bpf_plt_target - bpf_plt), + sizeof(void *)); } /* Adjust the mask of the branch. */ diff --git a/arch/sh/Kconfig.cpu b/arch/sh/Kconfig.cpu index fff419f3d757..336c54369636 100644 --- a/arch/sh/Kconfig.cpu +++ b/arch/sh/Kconfig.cpu @@ -85,7 +85,7 @@ config CPU_HAS_SR_RB that are lacking this bit must have another method in place for accomplishing what is taken care of by the banked registers. - See <file:Documentation/sh/register-banks.rst> for further + See <file:Documentation/arch/sh/register-banks.rst> for further information on SR.RB and register banking in the kernel in general. config CPU_HAS_PTEAEX diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a825bf031f49..57a14206ad8f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -283,7 +283,6 @@ config X86 select RTC_LIB select RTC_MC146818_LIB select SPARSE_IRQ - select SRCU select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT @@ -434,7 +433,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also <file:Documentation/x86/i386/IO-APIC.rst>, + See also <file:Documentation/arch/x86/i386/IO-APIC.rst>, <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at <http://www.tldp.org/docs.html#howto>. @@ -1324,7 +1323,7 @@ config MICROCODE the Linux kernel. The preferred method to load microcode from a detached initrd is described - in Documentation/x86/microcode.rst. For that you need to enable + in Documentation/arch/x86/microcode.rst. For that you need to enable CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the initrd for microcode blobs. @@ -1510,7 +1509,7 @@ config X86_5LEVEL A kernel with the option enabled can be booted on machines that support 4- or 5-level paging. - See Documentation/x86/x86_64/5level-paging.rst for more + See Documentation/arch/x86/x86_64/5level-paging.rst for more information. Say N if unsure. @@ -1774,7 +1773,7 @@ config MTRR You can safely say Y even if your machine doesn't have MTRRs, you'll just add about 9 KB to your kernel. - See <file:Documentation/x86/mtrr.rst> for more information. + See <file:Documentation/arch/x86/mtrr.rst> for more information. config MTRR_SANITIZER def_bool y @@ -1938,7 +1937,6 @@ config X86_SGX depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC depends on CRYPTO=y depends on CRYPTO_SHA256=y - select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA select XARRAY_MULTI @@ -2551,7 +2549,7 @@ config PAGE_TABLE_ISOLATION ensuring that the majority of kernel addresses are not mapped into userspace. - See Documentation/x86/pti.rst for more details. + See Documentation/arch/x86/pti.rst for more details. config RETPOLINE bool "Avoid speculative indirect branches in kernel" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bdfe08f1a930..c5d614d28a75 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -97,7 +97,7 @@ config IOMMU_DEBUG code. When you use it make sure you have a big enough IOMMU/AGP aperture. Most of the options enabled by this can be set more finegrained using the iommu= command line - options. See Documentation/x86/x86_64/boot-options.rst for more + options. See Documentation/arch/x86/x86_64/boot-options.rst for more details. config IOMMU_LEAK diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index b70559b821df..2106a2bd152b 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -3,9 +3,14 @@ core-y += arch/x86/crypto/ # # Disable SSE and other FP/SIMD instructions to match normal x86 +# This is required to work around issues in older LLVM versions, but breaks +# GCC versions < 11. See: +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652 # +ifeq ($(CONFIG_CC_IS_CLANG),y) KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 +endif ifeq ($(CONFIG_X86_32),y) START := 0x8048000 diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9338c68e7413..b04ca8e2b213 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -321,7 +321,7 @@ start_sys_seg: .word SYSSEG # obsolete and meaningless, but just type_of_loader: .byte 0 # 0 means ancient bootloader, newer # bootloaders know to change this. - # See Documentation/x86/boot.rst for + # See Documentation/arch/x86/boot.rst for # assigned ids # flags, unused bits must be zero (RFU) bit within loadflags diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 49b44f881484..f4f0625691fd 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -30,6 +30,22 @@ static bool intel_cc_platform_has(enum cc_attr attr) } /* + * Handle the SEV-SNP vTOM case where sme_me_mask is zero, and + * the other levels of SME/SEV functionality, including C-bit + * based SEV-SNP, are not enabled. + */ +static __maybe_unused bool amd_cc_platform_vtom(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_GUEST_MEM_ENCRYPT: + case CC_ATTR_MEM_ENCRYPT: + return true; + default: + return false; + } +} + +/* * SME and SEV are very similar but they are not the same, so there are * times that the kernel will need to distinguish between SME and SEV. The * cc_platform_has() function is used for this. When a distinction isn't @@ -41,9 +57,14 @@ static bool intel_cc_platform_has(enum cc_attr attr) * up under SME the trampoline area cannot be encrypted, whereas under SEV * the trampoline area must be encrypted. */ + static bool amd_cc_platform_has(enum cc_attr attr) { #ifdef CONFIG_AMD_MEM_ENCRYPT + + if (sev_status & MSR_AMD64_SNP_VTOM) + return amd_cc_platform_vtom(attr); + switch (attr) { case CC_ATTR_MEM_ENCRYPT: return sme_me_mask; @@ -76,11 +97,6 @@ static bool amd_cc_platform_has(enum cc_attr attr) #endif } -static bool hyperv_cc_platform_has(enum cc_attr attr) -{ - return attr == CC_ATTR_GUEST_MEM_ENCRYPT; -} - bool cc_platform_has(enum cc_attr attr) { switch (vendor) { @@ -88,8 +104,6 @@ bool cc_platform_has(enum cc_attr attr) return amd_cc_platform_has(attr); case CC_VENDOR_INTEL: return intel_cc_platform_has(attr); - case CC_VENDOR_HYPERV: - return hyperv_cc_platform_has(attr); default: return false; } @@ -103,11 +117,14 @@ u64 cc_mkenc(u64 val) * encryption status of the page. * * - for AMD, bit *set* means the page is encrypted - * - for Intel *clear* means encrypted. + * - for AMD with vTOM and for Intel, *clear* means encrypted */ switch (vendor) { case CC_VENDOR_AMD: - return val | cc_mask; + if (sev_status & MSR_AMD64_SNP_VTOM) + return val & ~cc_mask; + else + return val | cc_mask; case CC_VENDOR_INTEL: return val & ~cc_mask; default: @@ -120,7 +137,10 @@ u64 cc_mkdec(u64 val) /* See comment in cc_mkenc() */ switch (vendor) { case CC_VENDOR_AMD: - return val & ~cc_mask; + if (sev_status & MSR_AMD64_SNP_VTOM) + return val | cc_mask; + else + return val & ~cc_mask; case CC_VENDOR_INTEL: return val | cc_mask; default: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index eccc3431e515..d94d361f506f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -8,7 +8,7 @@ * * entry.S contains the system-call and fault low-level handling routines. * - * Some of this is documented in Documentation/x86/entry_64.rst + * Some of this is documented in Documentation/arch/x86/entry_64.rst * * A note on terminology: * - iret frame: Architecture defined interrupt frame from SS to RIP diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 41ef036ebb7b..edbc67ec1f3e 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -29,7 +29,6 @@ #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> #include <linux/highmem.h> -#include <linux/swiotlb.h> int hyperv_init_cpuhp; u64 hv_current_partition_id = ~0ull; @@ -504,16 +503,6 @@ void __init hyperv_init(void) /* Query the VMs extended capability once, so that it can be cached. */ hv_query_ext_cap(0); -#ifdef CONFIG_SWIOTLB - /* - * Swiotlb bounce buffer needs to be mapped in extra address - * space. Map function doesn't work in the early place and so - * call swiotlb_update_mem_attributes() here. - */ - if (hv_is_isolation_supported()) - swiotlb_update_mem_attributes(); -#endif - return; clean_guest_os_id: diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 1dbcbd9da74d..f6a020cb1a24 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -13,6 +13,8 @@ #include <asm/svm.h> #include <asm/sev.h> #include <asm/io.h> +#include <asm/coco.h> +#include <asm/mem_encrypt.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> @@ -233,41 +235,6 @@ void hv_ghcb_msr_read(u64 msr, u64 *value) local_irq_restore(flags); } EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); -#endif - -enum hv_isolation_type hv_get_isolation_type(void) -{ - if (!(ms_hyperv.priv_high & HV_ISOLATION)) - return HV_ISOLATION_TYPE_NONE; - return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); -} -EXPORT_SYMBOL_GPL(hv_get_isolation_type); - -/* - * hv_is_isolation_supported - Check system runs in the Hyper-V - * isolation VM. - */ -bool hv_is_isolation_supported(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) - return false; - - if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) - return false; - - return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; -} - -DEFINE_STATIC_KEY_FALSE(isolation_type_snp); - -/* - * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based - * isolation VM. - */ -bool hv_isolation_type_snp(void) -{ - return static_branch_unlikely(&isolation_type_snp); -} /* * hv_mark_gpa_visibility - Set pages visible to host via hvcall. @@ -320,27 +287,25 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], } /* - * hv_set_mem_host_visibility - Set specified memory visible to host. + * hv_vtom_set_host_visibility - Set specified memory visible to host. * * In Isolation VM, all guest memory is encrypted from host and guest * needs to set memory visible to host via hvcall before sharing memory * with host. This function works as wrap of hv_mark_gpa_visibility() * with memory base and size. */ -int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible) +static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc) { - enum hv_mem_host_visibility visibility = visible ? - VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE; + enum hv_mem_host_visibility visibility = enc ? + VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE; u64 *pfn_array; int ret = 0; + bool result = true; int i, pfn; - if (!hv_is_isolation_supported() || !hv_hypercall_pg) - return 0; - pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!pfn_array) - return -ENOMEM; + return false; for (i = 0, pfn = 0; i < pagecount; i++) { pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE); @@ -349,17 +314,68 @@ int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visibl if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { ret = hv_mark_gpa_visibility(pfn, pfn_array, visibility); - if (ret) + if (ret) { + result = false; goto err_free_pfn_array; + } pfn = 0; } } err_free_pfn_array: kfree(pfn_array); - return ret; + return result; +} + +static bool hv_vtom_tlb_flush_required(bool private) +{ + return true; } +static bool hv_vtom_cache_flush_required(void) +{ + return false; +} + +static bool hv_is_private_mmio(u64 addr) +{ + /* + * Hyper-V always provides a single IO-APIC in a guest VM. + * When a paravisor is used, it is emulated by the paravisor + * in the guest context and must be mapped private. + */ + if (addr >= HV_IOAPIC_BASE_ADDRESS && + addr < (HV_IOAPIC_BASE_ADDRESS + PAGE_SIZE)) + return true; + + /* Same with a vTPM */ + if (addr >= VTPM_BASE_ADDRESS && + addr < (VTPM_BASE_ADDRESS + PAGE_SIZE)) + return true; + + return false; +} + +void __init hv_vtom_init(void) +{ + /* + * By design, a VM using vTOM doesn't see the SEV setting, + * so SEV initialization is bypassed and sev_status isn't set. + * Set it here to indicate a vTOM VM. + */ + sev_status = MSR_AMD64_SNP_VTOM; + cc_set_vendor(CC_VENDOR_AMD); + cc_set_mask(ms_hyperv.shared_gpa_boundary); + physical_mask &= ms_hyperv.shared_gpa_boundary - 1; + + x86_platform.hyper.is_private_mmio = hv_is_private_mmio; + x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; + x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; +} + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + /* * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM. */ @@ -377,7 +393,7 @@ void *hv_map_memory(void *addr, unsigned long size) pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) + (ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT); - vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO); + vaddr = vmap_pfn(pfns, size / PAGE_SIZE, pgprot_decrypted(PAGE_KERNEL)); kfree(pfns); return vaddr; @@ -387,3 +403,37 @@ void hv_unmap_memory(void *addr) { vunmap(addr); } + +enum hv_isolation_type hv_get_isolation_type(void) +{ + if (!(ms_hyperv.priv_high & HV_ISOLATION)) + return HV_ISOLATION_TYPE_NONE; + return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); +} +EXPORT_SYMBOL_GPL(hv_get_isolation_type); + +/* + * hv_is_isolation_supported - Check system runs in the Hyper-V + * isolation VM. + */ +bool hv_is_isolation_supported(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return false; + + if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) + return false; + + return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; +} + +DEFINE_STATIC_KEY_FALSE(isolation_type_snp); + +/* + * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based + * isolation VM. + */ +bool hv_isolation_type_snp(void) +{ + return static_branch_unlikely(&isolation_type_snp); +} diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e2975a32d443..d7da28fada87 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -8,7 +8,7 @@ #define ALT_FLAGS_SHIFT 16 -#define ALT_FLAG_NOT BIT(0) +#define ALT_FLAG_NOT (1 << 0) #define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 53e9b0620d96..d90ae472fb76 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -38,7 +38,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear * this field. The purpose of this field is to guarantee * compliance with the x86 boot spec located in - * Documentation/x86/boot.rst . That spec says that the + * Documentation/arch/x86/boot.rst . That spec says that the * *whole* structure should be cleared, after which only the * portion defined by struct setup_header (boot_params->hdr) * should be copied in. diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 3d98c3a60d34..d2c6a2e8d04d 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -7,7 +7,6 @@ enum cc_vendor { CC_VENDOR_NONE, CC_VENDOR_AMD, - CC_VENDOR_HYPERV, CC_VENDOR_INTEL, }; diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 72ca90552b6a..b7126701574c 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -56,6 +56,7 @@ void __init sev_es_init_vc_handling(void); #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL +#define sev_status 0ULL static inline void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) { } diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index e01aa74a6de7..c3ad8a526378 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -16,13 +16,6 @@ extern atomic64_t last_mm_ctx_id; -#ifndef CONFIG_PARAVIRT_XXL -static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) -{ -} -#endif /* !CONFIG_PARAVIRT_XXL */ - #ifdef CONFIG_PERF_EVENTS DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); @@ -135,7 +128,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev, next) \ do { \ - paravirt_activate_mm((prev), (next)); \ + paravirt_enter_mmap(next); \ switch_mm((prev), (next), NULL); \ } while (0); @@ -168,7 +161,7 @@ static inline void arch_dup_pkeys(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); - paravirt_arch_dup_mmap(oldmm, mm); + paravirt_enter_mmap(mm); return ldt_dup_context(oldmm, mm); } diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 4c4c0ec3b62e..e3cef98a0142 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -11,6 +11,14 @@ #include <asm/paravirt.h> #include <asm/mshyperv.h> +/* + * Hyper-V always provides a single IO-APIC at this MMIO address. + * Ideally, the value should be looked up in ACPI tables, but it + * is needed for mapping the IO-APIC early in boot on Confidential + * VMs, before ACPI functions can be used. + */ +#define HV_IOAPIC_BASE_ADDRESS 0xfec00000 + union hv_ghcb; DECLARE_STATIC_KEY_FALSE(isolation_type_snp); @@ -206,18 +214,19 @@ struct irq_domain *hv_create_pci_msi_domain(void); int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, struct hv_interrupt_entry *entry); int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); -int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible); #ifdef CONFIG_AMD_MEM_ENCRYPT void hv_ghcb_msr_write(u64 msr, u64 value); void hv_ghcb_msr_read(u64 msr, u64 *value); bool hv_ghcb_negotiate_protocol(void); void hv_ghcb_terminate(unsigned int set, unsigned int reason); +void hv_vtom_init(void); #else static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} +static inline void hv_vtom_init(void) {} #endif extern bool hv_isolation_type_snp(void); @@ -259,11 +268,6 @@ static inline void hv_set_register(unsigned int reg, u64 value) { } static inline u64 hv_get_register(unsigned int reg) { return 0; } static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { } static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } -static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages, - bool visible) -{ - return -1; -} #endif /* CONFIG_HYPERV */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index e9e2c3ba5923..06ef25411d62 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -49,7 +49,7 @@ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86/x86_64/mm.rst for a description of the memory map. */ +/* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 52 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cf40e813b3d7..b49778664d2b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -334,16 +334,9 @@ static inline void tss_update_io_bitmap(void) } #endif -static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) +static inline void paravirt_enter_mmap(struct mm_struct *next) { - PVOP_VCALL2(mmu.activate_mm, prev, next); -} - -static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ - PVOP_VCALL2(mmu.dup_mmap, oldmm, mm); + PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) @@ -789,8 +782,7 @@ extern void default_banner(void); #ifndef __ASSEMBLY__ #ifndef CONFIG_PARAVIRT_XXL -static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8c1da419260f..4acbcddddc29 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -164,11 +164,8 @@ struct pv_mmu_ops { unsigned long (*read_cr3)(void); void (*write_cr3)(unsigned long); - /* Hooks for intercepting the creation/use of an mm_struct. */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); + /* Hook for intercepting the creation/use of an mm_struct. */ + void (*enter_mmap)(struct mm_struct *mm); /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); @@ -562,8 +559,14 @@ void paravirt_flush_lazy_mmu(void); void _paravirt_nop(void); void paravirt_BUG(void); -u64 _paravirt_ident_64(u64); unsigned long paravirt_ret0(void); +#ifdef CONFIG_PARAVIRT_XXL +u64 _paravirt_ident_64(u64); +unsigned long pv_native_save_fl(void); +void pv_native_irq_disable(void); +void pv_native_irq_enable(void); +unsigned long pv_native_read_cr2(void); +#endif #define paravirt_nop ((void *)_paravirt_nop) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 38bf837e3554..38b54b992f32 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -104,7 +104,7 @@ extern unsigned int ptrs_per_p4d; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* - * See Documentation/x86/x86_64/mm.rst for a description of the memory map. + * See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. * * Be very careful vs. KASLR when changing anything here. The KASLR address * range must not overlap with anything except the KASAN shadow area, which diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index b63be696b776..0759af9b1acf 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -128,10 +128,6 @@ struct snp_psc_desc { struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; } __packed; -/* Guest message request error codes */ -#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32) -#define SNP_GUEST_REQ_ERR_BUSY BIT_ULL(33) - #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ebc271bb6d8e..13dc2a9d23c1 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -9,6 +9,8 @@ #define __ASM_ENCRYPTED_STATE_H #include <linux/types.h> +#include <linux/sev-guest.h> + #include <asm/insn.h> #include <asm/sev-common.h> #include <asm/bootparam.h> @@ -185,6 +187,9 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } + +struct snp_guest_request_ioctl; + void setup_ghcb(void); void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages); @@ -196,7 +201,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void __init __noreturn snp_abort(void); -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -216,8 +221,7 @@ static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npag static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } -static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, - unsigned long *fw_err) +static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) { return -ENOTTY; } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index d13d71af5cf6..0a49a8de9f3c 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -18,32 +18,26 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_unrolled(void *to, const void *from, unsigned len); +rep_movs_alternative(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len) +copy_user_generic(void *to, const void *from, unsigned long len) { - unsigned ret; - + stac(); /* - * If CPU has ERMS feature, use copy_user_enhanced_fast_string. - * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. - * Otherwise, use copy_user_generic_unrolled. + * If CPU has FSRM feature, use 'rep movs'. + * Otherwise, use rep_movs_alternative. */ - alternative_call_2(copy_user_generic_unrolled, - copy_user_generic_string, - X86_FEATURE_REP_GOOD, - copy_user_enhanced_fast_string, - X86_FEATURE_ERMS, - ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), - "=d" (len)), - "1" (to), "2" (from), "3" (len) - : "memory", "rcx", "r8", "r9", "r10", "r11"); - return ret; + asm volatile( + "1:\n\t" + ALTERNATIVE("rep movsb", + "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT + : : "memory", "rax", "r8", "r9", "r10", "r11"); + clac(); + return len; } static __always_inline __must_check unsigned long @@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -extern long __copy_user_nocache(void *dst, const void __user *src, - unsigned size, int zerorest); - +extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); @@ -69,8 +61,12 @@ static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + long ret; kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 0); + stac(); + ret = __copy_user_nocache(dst, src, size); + clac(); + return ret; } static inline int @@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) */ __must_check unsigned long -clear_user_original(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_rep_good(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_erms(void __user *addr, unsigned long len); +rep_stos_alternative(void __user *addr, unsigned long len); static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) { @@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr */ asm volatile( "1:\n\t" - ALTERNATIVE_3("rep stosb", - "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM), - "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS), - "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD)) + ALTERNATIVE("rep stosb", + "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT - : "a" (0) - /* rep_good clobbers %rdx */ - : "rdx"); + : "a" (0)); clac(); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c1c8c581759d..acc20ae4079d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -259,11 +259,15 @@ struct x86_legacy_features { * VMMCALL under SEV-ES. Needs to return 'false' * if the checks fail. Called from the #VC * exception handler. + * @is_private_mmio: For CoCo VMs, must map MMIO address as private. + * Used when device is emulated by a paravisor + * layer in the VM context. */ struct x86_hyper_runtime { void (*pin_vcpu)(int cpu); void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs); bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs); + bool (*is_private_mmio)(u64 addr); }; /** diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0dac4ab5b55b..21b542a6866c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1858,13 +1858,18 @@ early_param("acpi_sci", setup_acpi_sci); int __acpi_acquire_global_lock(unsigned int *lock) { - unsigned int old, new; + unsigned int old, new, val; old = READ_ONCE(*lock); do { - new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); + val = (old >> 1) & 0x1; + new = (old & ~0x3) + 2 + val; } while (!try_cmpxchg(lock, &old, new)); - return ((new & 0x3) < 3) ? -1 : 0; + + if (val) + return 0; + + return -1; } int __acpi_release_global_lock(unsigned int *lock) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1f83b052bb74..146671de9ddc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,7 @@ #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/pgtable.h> +#include <asm/x86_init.h> #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) @@ -2680,10 +2681,15 @@ static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) pgprot_t flags = FIXMAP_PAGE_NOCACHE; /* - * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot + * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot * bits, just like normal ioremap(): */ - flags = pgprot_decrypted(flags); + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + if (x86_platform.hyper.is_private_mmio(phys)) + flags = pgprot_encrypted(flags); + else + flags = pgprot_decrypted(flags); + } __set_fixmap(idx, phys, flags); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 95cdd08c4cbb..571abf808ea3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + /* AMD FSRM also implies FSRS */ + if (cpu_has(c, X86_FEATURE_FSRM)) + set_cpu_cap(c, X86_FEATURE_FSRS); + /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); @@ -1005,6 +1009,17 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); + + /* + * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up + * using the trampoline code and as part of it, MSR_EFER gets prepared there in + * order to be replicated onto them. Regardless, set it here again, if not set, + * to protect against any future refactoring/code reorganization which might + * miss setting this important bit. + */ + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + cpu_has(c, X86_FEATURE_AUTOIBRS)) + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index f9d060e71c3e..182af64387d0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -784,8 +784,7 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = - SPECTRE_V2_NONE; +enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt @@ -1133,13 +1132,6 @@ spectre_v2_parse_user_cmdline(void) return SPECTRE_V2_USER_CMD_AUTO; } -static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) -{ - return mode == SPECTRE_V2_EIBRS || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_EIBRS_LFENCE; -} - static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8cd4126d8253..80710a68ef7d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -121,6 +121,7 @@ static const struct x86_cpu_id ppin_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 57a5349e6954..f97b0fe13da8 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -83,4 +83,12 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); +extern enum spectre_v2_mitigation spectre_v2_enabled; + +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 291d4167fab8..1c648b09e053 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1451,31 +1451,13 @@ void handle_bus_lock(struct pt_regs *regs) } /* - * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should - * only be trusted if it is confirmed that a CPU model implements a - * specific feature at a particular bit position. - * - * The possible driver data field values: - * - * - 0: CPU models that are known to have the per-core split-lock detection - * feature even though they do not enumerate IA32_CORE_CAPABILITIES. - * - * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use - * bit 5 to enumerate the per-core split-lock detection feature. + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), {} }; @@ -1487,24 +1469,27 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; + /* Check for CPUs that have support but do not enumerate it: */ m = x86_match_cpu(split_lock_cpu_ids); - if (!m) - return; + if (m) + goto supported; - switch (m->driver_data) { - case 0: - break; - case 1: - if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) - return; - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) - return; - break; - default: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) return; - } + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. + */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: cpu_model_supports_sld = true; __split_lock_setup(); } diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 23c5072fbbb7..0b971f974096 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -235,10 +235,10 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); * A list of the banks enabled on each logical CPU. Controls which respective * descriptors to initialize later in mce_threshold_create_device(). */ -static DEFINE_PER_CPU(unsigned int, bank_map); +static DEFINE_PER_CPU(u64, bank_map); /* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u32, smca_misc_banks_map); +static DEFINE_PER_CPU(u64, smca_misc_banks_map); static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -267,7 +267,7 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) return; if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT(bank); + per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); } @@ -530,7 +530,7 @@ static u32 smca_get_block_address(unsigned int bank, unsigned int block, if (!block) return MSR_AMD64_SMCA_MCx_MISC(bank); - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank))) + if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) return 0; return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); @@ -574,7 +574,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int new; if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); + per_cpu(bank_map, cpu) |= BIT_ULL(bank); memset(&b, 0, sizeof(b)); b.cpu = cpu; @@ -878,7 +878,7 @@ static void amd_threshold_interrupt(void) return; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) + if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; first_block = bp[bank]->blocks; @@ -1029,7 +1029,7 @@ static const struct sysfs_ops threshold_ops = { static void threshold_block_release(struct kobject *kobj); -static struct kobj_type threshold_ktype = { +static const struct kobj_type threshold_ktype = { .sysfs_ops = &threshold_ops, .default_groups = default_groups, .release = threshold_block_release, @@ -1356,7 +1356,7 @@ int mce_threshold_create_device(unsigned int cpu) return -ENOMEM; for (bank = 0; bank < numbanks; ++bank) { - if (!(this_cpu_read(bank_map) & (1 << bank))) + if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; err = threshold_create_bank(bp, cpu, bank); if (err) { diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 91a415553c27..d2412ce2d312 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -244,11 +244,11 @@ noinstr void pentium_machine_check(struct pt_regs *regs); noinstr void winchip_machine_check(struct pt_regs *regs); static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } #else -static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void enable_p5_mce(void) {} -static inline void pentium_machine_check(struct pt_regs *regs) {} -static inline void winchip_machine_check(struct pt_regs *regs) {} +static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static __always_inline void enable_p5_mce(void) {} +static __always_inline void pentium_machine_check(struct pt_regs *regs) {} +static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrl(u32 msr); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9eb457b10341..f5fdeb1e3606 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -61,7 +61,7 @@ static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/microcode.rst + * format. See Documentation/arch/x86/microcode.rst */ static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f1197366a97d..315fc358e584 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -33,7 +33,6 @@ #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> #include <asm/numa.h> -#include <asm/coco.h> /* Is Linux running as the root partition? */ bool hv_root_partition; @@ -401,8 +400,10 @@ static void __init ms_hyperv_init_platform(void) if (ms_hyperv.priv_high & HV_ISOLATION) { ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); - ms_hyperv.shared_gpa_boundary = - BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); + + if (ms_hyperv.shared_gpa_boundary_active) + ms_hyperv.shared_gpa_boundary = + BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); @@ -413,11 +414,6 @@ static void __init ms_hyperv_init_platform(void) swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary; #endif } - /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE) - cc_set_vendor(CC_VENDOR_HYPERV); - } } if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { @@ -486,6 +482,9 @@ static void __init ms_hyperv_init_platform(void) i8253_clear_counter_on_shutdown = false; #if IS_ENABLED(CONFIG_HYPERV) + if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || + (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)) + hv_vtom_init(); /* * Setup the hook to get control post apic initialization. */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7fe51488e136..0e7b6afe2fa6 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -76,7 +76,7 @@ unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) /* - * The correction factor table is documented in Documentation/x86/resctrl.rst. + * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. * diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index e5a37b6e9aa5..166692f2d501 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -892,20 +892,19 @@ static struct miscdevice sgx_dev_provision = { int sgx_set_attribute(unsigned long *allowed_attributes, unsigned int attribute_fd) { - struct file *file; + struct fd f = fdget(attribute_fd); - file = fget(attribute_fd); - if (!file) + if (!f.file) return -EINVAL; - if (file->f_op != &sgx_provision_fops) { - fput(file); + if (f.file->f_op != &sgx_provision_fops) { + fdput(f); return -EINVAL; } *allowed_attributes |= SGX_ATTR_PROVISIONKEY; - fput(file); + fdput(f); return 0; } EXPORT_SYMBOL_GPL(sgx_set_attribute); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 0f2020653fba..d2dad21259a8 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -15,7 +15,7 @@ #define EREMOVE_ERROR_MESSAGE \ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ - "Refer to Documentation/x86/sgx.rst for more information." + "Refer to Documentation/arch/x86/sgx.rst for more information." #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 6b58610a1552..a61c12c01270 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -476,7 +476,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, efi_map_offset = params_cmdline_sz; efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16); - /* Copy setup header onto bootparams. Documentation/x86/boot.rst */ + /* Copy setup header onto bootparams. Documentation/arch/x86/boot.rst */ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; /* Is there a limit on setup header size? */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 42e182868873..ac10b46c5832 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -64,11 +64,11 @@ static unsigned paravirt_patch_call(void *insn_buff, const void *target, } #ifdef CONFIG_PARAVIRT_XXL -/* identity function, which can be inlined */ -u64 notrace _paravirt_ident_64(u64 x) -{ - return x; -} +DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text); +DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -197,11 +197,6 @@ void paravirt_end_context_switch(struct task_struct *next) arch_enter_lazy_mmu_mode(); } -static noinstr unsigned long pv_native_read_cr2(void) -{ - return native_read_cr2(); -} - static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); @@ -222,16 +217,6 @@ noinstr void pv_native_wbinvd(void) native_wbinvd(); } -static noinstr void pv_native_irq_enable(void) -{ - native_irq_enable(); -} - -static noinstr void pv_native_irq_disable(void) -{ - native_irq_disable(); -} - static noinstr void pv_native_safe_halt(void) { native_safe_halt(); @@ -298,7 +283,7 @@ struct paravirt_patch_template pv_ops = { .cpu.end_context_switch = paravirt_nop, /* Irq ops. */ - .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), + .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = pv_native_safe_halt, @@ -363,8 +348,7 @@ struct paravirt_patch_template pv_ops = { .mmu.make_pte = PTE_IDENT, .mmu.make_pgd = PTE_IDENT, - .mmu.dup_mmap = paravirt_nop, - .mmu.activate_mm = paravirt_nop, + .mmu.enter_mmap = paravirt_nop, .mmu.lazy_mode = { .enter = paravirt_nop, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 30bbe4abb5d6..de6be0a3965e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -124,7 +124,7 @@ void __init pci_iommu_alloc(void) } /* - * See <Documentation/x86/x86_64/boot-options.rst> for the iommu kernel + * See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel * parameter documentation. */ static __init int iommu_setup(char *p) diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 3f664ab277c4..b031244d6d2d 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -22,6 +22,8 @@ #include <linux/efi.h> #include <linux/platform_device.h> #include <linux/io.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> @@ -2175,7 +2177,7 @@ static int __init init_sev_config(char *str) } __setup("sev=", init_sev_config); -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) { struct ghcb_state state; struct es_em_ctxt ctxt; @@ -2183,8 +2185,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned struct ghcb *ghcb; int ret; - if (!fw_err) - return -EINVAL; + rio->exitinfo2 = SEV_RET_NO_FW_CALL; /* * __sev_get_ghcb() needs to run with IRQs disabled because it is using @@ -2209,16 +2210,16 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned if (ret) goto e_put; - *fw_err = ghcb->save.sw_exit_info_2; - switch (*fw_err) { + rio->exitinfo2 = ghcb->save.sw_exit_info_2; + switch (rio->exitinfo2) { case 0: break; - case SNP_GUEST_REQ_ERR_BUSY: + case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY): ret = -EAGAIN; break; - case SNP_GUEST_REQ_INVALID_LEN: + case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): /* Number of expected pages are returned in RBX */ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { input->data_npages = ghcb_get_rbx(ghcb); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ef80d361b463..ecdeb0974a87 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -33,8 +33,8 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } -static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } -static __init void get_rtc_noop(struct timespec64 *now) { } +static int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +static void get_rtc_noop(struct timespec64 *now) { } static __initconst const struct of_device_id of_cmos_match[] = { { .compatible = "motorola,mc146818" }, @@ -134,6 +134,7 @@ static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } +static bool is_private_mmio_noop(u64 addr) {return false; } struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu_early, @@ -149,6 +150,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { .realmode_reserve = reserve_real_mode, .realmode_init = init_real_mode, .hyper.pin_vcpu = x86_op_int_noop, + .hyper.is_private_mmio = is_private_mmio_noop, .guest = { .enc_status_change_prepare = enc_status_change_prepare_noop, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8e578311ca9d..89ca7f4c1464 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,7 +46,6 @@ config KVM select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO - select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c25aeb550cd9..52398d49bc2f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1767,18 +1767,20 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct file *source_kvm_file; + struct fd f = fdget(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - source_kvm_file = fget(source_fd); - if (!file_is_kvm(source_kvm_file)) { + if (!f.file) + return -EBADF; + + if (!file_is_kvm(f.file)) { ret = -EBADF; goto out_fput; } - source_kvm = source_kvm_file->private_data; + source_kvm = f.file->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto out_fput; @@ -1828,8 +1830,7 @@ out_dst_cgroup: out_unlock: sev_unlock_two_vms(kvm, source_kvm); out_fput: - if (source_kvm_file) - fput(source_kvm_file); + fdput(f); return ret; } @@ -2046,18 +2047,20 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct file *source_kvm_file; + struct fd f = fdget(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - source_kvm_file = fget(source_fd); - if (!file_is_kvm(source_kvm_file)) { + if (!f.file) + return -EBADF; + + if (!file_is_kvm(f.file)) { ret = -EBADF; goto e_source_fput; } - source_kvm = source_kvm_file->private_data; + source_kvm = f.file->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto e_source_fput; @@ -2103,8 +2106,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); e_source_fput: - if (source_kvm_file) - fput(source_kvm_file); + fdput(f); return ret; } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4f1a40a86534..01932af64193 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y) endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o + lib-y += copy_user_64.o copy_user_uncached_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ecbfb4dd3b01..f74a3e704a1c 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms) * Input: * rdi destination * rcx count + * rax is zero * * Output: * rcx: uncleared bytes or 0 if successful. */ -SYM_FUNC_START(clear_user_original) - /* - * Copy only the lower 32 bits of size as that is enough to handle the rest bytes, - * i.e., no need for a 'q' suffix and thus a REX prefix. - */ - mov %ecx,%eax - shr $3,%rcx - jz .Lrest_bytes +SYM_FUNC_START(rep_stos_alternative) + cmpq $64,%rcx + jae .Lunrolled - # do the qwords first - .p2align 4 -.Lqwords: - movq $0,(%rdi) - lea 8(%rdi),%rdi - dec %rcx - jnz .Lqwords + cmp $8,%ecx + jae .Lword -.Lrest_bytes: - and $7, %eax - jz .Lexit + testl %ecx,%ecx + je .Lexit - # now do the rest bytes -.Lbytes: - movb $0,(%rdi) +.Lclear_user_tail: +0: movb %al,(%rdi) inc %rdi - dec %eax - jnz .Lbytes - + dec %rcx + jnz .Lclear_user_tail .Lexit: - /* - * %rax still needs to be cleared in the exception case because this function is called - * from inline asm and the compiler expects %rax to be zero when exiting the inline asm, - * in case it might reuse it somewhere. - */ - xor %eax,%eax - RET - -.Lqwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %eax - add %rax,%rcx - jmp .Lexit - -.Lbytes_exception: - mov %eax,%ecx - jmp .Lexit - - _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception) - _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception) -SYM_FUNC_END(clear_user_original) -EXPORT_SYMBOL(clear_user_original) - -/* - * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is - * present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. - */ -SYM_FUNC_START(clear_user_rep_good) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lprep: - # copy lower 32-bits for rest bytes - mov %ecx, %edx - shr $3, %rcx - jz .Lrep_good_rest_bytes - -.Lrep_good_qwords: - rep stosq - -.Lrep_good_rest_bytes: - and $7, %edx - jz .Lrep_good_exit - -.Lrep_good_bytes: - mov %edx, %ecx - rep stosb - -.Lrep_good_exit: - # see .Lexit comment above - xor %eax, %eax RET -.Lrep_good_qwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %edx - add %rdx, %rcx - jmp .Lrep_good_exit + _ASM_EXTABLE_UA( 0b, .Lexit) - _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception) - _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit) -SYM_FUNC_END(clear_user_rep_good) -EXPORT_SYMBOL(clear_user_rep_good) +.Lword: +1: movq %rax,(%rdi) + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lclear_user_tail -/* - * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. - * - */ -SYM_FUNC_START(clear_user_erms) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lerms_bytes: - rep stosb - -.Lerms_exit: - xorl %eax,%eax + .p2align 4 +.Lunrolled: +10: movq %rax,(%rdi) +11: movq %rax,8(%rdi) +12: movq %rax,16(%rdi) +13: movq %rax,24(%rdi) +14: movq %rax,32(%rdi) +15: movq %rax,40(%rdi) +16: movq %rax,48(%rdi) +17: movq %rax,56(%rdi) + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lclear_user_tail RET - _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit) -SYM_FUNC_END(clear_user_erms) -EXPORT_SYMBOL(clear_user_erms) + /* + * If we take an exception on any of the + * word stores, we know that %rcx isn't zero, + * so we can just go to the tail clearing to + * get the exact count. + * + * The unrolled case might end up clearing + * some bytes twice. Don't care. + * + * We could use the value in %rdi to avoid + * a second fault on the exact count case, + * but do we really care? No. + * + * Finally, we could try to align %rdi at the + * top of the unrolling. But unaligned stores + * just aren't that common or expensive. + */ + _ASM_EXTABLE_UA( 1b, .Lclear_user_tail) + _ASM_EXTABLE_UA(10b, .Lclear_user_tail) + _ASM_EXTABLE_UA(11b, .Lclear_user_tail) + _ASM_EXTABLE_UA(12b, .Lclear_user_tail) + _ASM_EXTABLE_UA(13b, .Lclear_user_tail) + _ASM_EXTABLE_UA(14b, .Lclear_user_tail) + _ASM_EXTABLE_UA(15b, .Lclear_user_tail) + _ASM_EXTABLE_UA(16b, .Lclear_user_tail) + _ASM_EXTABLE_UA(17b, .Lclear_user_tail) +SYM_FUNC_END(rep_stos_alternative) +EXPORT_SYMBOL(rep_stos_alternative) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 9dec1b38a98f..4fc5c2de2de4 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,404 +7,108 @@ */ #include <linux/linkage.h> -#include <asm/current.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/cpufeatures.h> -#include <asm/alternative.h> #include <asm/asm.h> -#include <asm/smap.h> #include <asm/export.h> -#include <asm/trapnr.h> - -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - - _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align) - _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align) -.endm /* - * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro - * code for rep movsq - * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_generic_unrolled) - ASM_STAC - cmpl $8,%edx - jb .Lcopy_user_short_string_bytes - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz copy_user_short_string -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movq %r8,(%rdi) -6: movq %r9,1*8(%rdi) -7: movq %r10,2*8(%rdi) -8: movq %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movq %r8,4*8(%rdi) -14: movq %r9,5*8(%rdi) -15: movq %r10,6*8(%rdi) -16: movq %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b - jmp copy_user_short_string - -30: shll $6,%ecx - addl %ecx,%edx - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 30b) - _ASM_EXTABLE_CPY(2b, 30b) - _ASM_EXTABLE_CPY(3b, 30b) - _ASM_EXTABLE_CPY(4b, 30b) - _ASM_EXTABLE_CPY(5b, 30b) - _ASM_EXTABLE_CPY(6b, 30b) - _ASM_EXTABLE_CPY(7b, 30b) - _ASM_EXTABLE_CPY(8b, 30b) - _ASM_EXTABLE_CPY(9b, 30b) - _ASM_EXTABLE_CPY(10b, 30b) - _ASM_EXTABLE_CPY(11b, 30b) - _ASM_EXTABLE_CPY(12b, 30b) - _ASM_EXTABLE_CPY(13b, 30b) - _ASM_EXTABLE_CPY(14b, 30b) - _ASM_EXTABLE_CPY(15b, 30b) - _ASM_EXTABLE_CPY(16b, 30b) -SYM_FUNC_END(copy_user_generic_unrolled) -EXPORT_SYMBOL(copy_user_generic_unrolled) - -/* Some CPUs run faster using the string copy instructions. - * This is also a lot simpler. Use them when possible. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. + * rep_movs_alternative - memory copy with exception handling. + * This version is for CPUs that don't have FSRM (Fast Short Rep Movs) * * Input: * rdi destination * rsi source - * rdx count + * rcx count * * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_generic_string) - ASM_STAC - cmpl $8,%edx - jb 2f /* less than 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx -1: rep movsq -2: movl %edx,%ecx -3: rep movsb - xorl %eax,%eax - ASM_CLAC - RET - -11: leal (%rdx,%rcx,8),%ecx -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 11b) - _ASM_EXTABLE_CPY(3b, 12b) -SYM_FUNC_END(copy_user_generic_string) -EXPORT_SYMBOL(copy_user_generic_string) - -/* - * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. - * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. - * - * Input: - * rdi destination - * rsi source - * rdx count + * rcx uncopied bytes or 0 if successful. * - * Output: - * eax uncopied bytes or 0 if successful. + * NOTE! The calling convention is very intentionally the same as + * for 'rep movs', so that we can rewrite the function call with + * just a plain 'rep movs' on machines that have FSRM. But to make + * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely. */ -SYM_FUNC_START(copy_user_enhanced_fast_string) - ASM_STAC - /* CPUs without FSRM should avoid rep movsb for short copies */ - ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM - movl %edx,%ecx -1: rep movsb - xorl %eax,%eax - ASM_CLAC +SYM_FUNC_START(rep_movs_alternative) + cmpq $64,%rcx + jae .Lunrolled + + cmp $8,%ecx + jae .Lword + + testl %ecx,%ecx + je .Lexit + +.Lcopy_user_tail: +0: movb (%rsi),%al +1: movb %al,(%rdi) + inc %rdi + inc %rsi + dec %rcx + jne .Lcopy_user_tail +.Lexit: RET -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 12b) -SYM_FUNC_END(copy_user_enhanced_fast_string) -EXPORT_SYMBOL(copy_user_enhanced_fast_string) - -/* - * Try to copy last bytes and clear the rest if needed. - * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - * Don't try to copy the tail if machine check happened - * - * Input: - * eax trap number written by ex_handler_copy() - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) - cmp $X86_TRAP_MC,%eax - je 3f - - movl %edx,%ecx -1: rep movsb -2: mov %ecx,%eax - ASM_CLAC + _ASM_EXTABLE_UA( 0b, .Lexit) + _ASM_EXTABLE_UA( 1b, .Lexit) + + .p2align 4 +.Lword: +2: movq (%rsi),%rax +3: movq %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lcopy_user_tail + + _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail) + _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) + + .p2align 4 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +14: movq %r8,(%rdi) +15: movq %r9,8(%rdi) +16: movq %r10,16(%rdi) +17: movq %r11,24(%rdi) +20: movq 32(%rsi),%r8 +21: movq 40(%rsi),%r9 +22: movq 48(%rsi),%r10 +23: movq 56(%rsi),%r11 +24: movq %r8,32(%rdi) +25: movq %r9,40(%rdi) +26: movq %r10,48(%rdi) +27: movq %r11,56(%rdi) + addq $64,%rsi + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lcopy_user_tail RET -3: - movl %edx,%eax - ASM_CLAC - RET - - _ASM_EXTABLE_CPY(1b, 2b) - -.Lcopy_user_handle_align: - addl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - -SYM_CODE_END(.Lcopy_user_handle_tail) - -/* - * Finish memcpy of less than 64 bytes. #AC should already be set. - * - * Input: - * rdi destination - * rsi source - * rdx count (< 64) - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(copy_user_short_string) - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .Lcopy_user_short_string_bytes -18: movq (%rsi),%r8 -19: movq %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -.Lcopy_user_short_string_bytes: - andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xor %eax,%eax - ASM_CLAC - RET - -40: leal (%rdx,%rcx,8),%edx - jmp 60f -50: movl %ecx,%edx /* ecx is zerorest also */ -60: jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(18b, 40b) - _ASM_EXTABLE_CPY(19b, 40b) - _ASM_EXTABLE_CPY(21b, 50b) - _ASM_EXTABLE_CPY(22b, 50b) -SYM_CODE_END(copy_user_short_string) - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination out of cache for more performance. - * - * Note: Cached memory copy is used when destination or size is not - * naturally aligned. That is: - * - Require 8-byte alignment when size is 8 bytes or larger. - * - Require 4-byte alignment when size is 4 bytes. - */ -SYM_FUNC_START(__copy_user_nocache) - ASM_STAC - - /* If size is less than 8 bytes, go to 4-byte copy */ - cmpl $8,%edx - jb .L_4b_nocache_copy_entry - - /* If destination is not 8-byte aligned, "cache" copy to align it */ - ALIGN_DESTINATION - - /* Set 4x8-byte copy count and remainder */ - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz .L_8b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 4x8-byte nocache loop-copy */ -.L_4x8b_nocache_copy_loop: -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz .L_4x8b_nocache_copy_loop - - /* Set 8-byte copy count and remainder */ -.L_8b_nocache_copy_entry: - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .L_4b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 8-byte nocache loop-copy */ -.L_8b_nocache_copy_loop: -20: movq (%rsi),%r8 -21: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz .L_8b_nocache_copy_loop - - /* If no byte left, we're done */ -.L_4b_nocache_copy_entry: - andl %edx,%edx - jz .L_finish_copy - - /* If destination is not 4-byte aligned, go to byte copy: */ - movl %edi,%ecx - andl $3,%ecx - jnz .L_1b_cache_copy_entry - - /* Set 4-byte copy count (1 or 0) and remainder */ - movl %edx,%ecx - andl $3,%edx - shrl $2,%ecx - jz .L_1b_cache_copy_entry /* jump if count is 0 */ - - /* Perform 4-byte nocache copy: */ -30: movl (%rsi),%r8d -31: movnti %r8d,(%rdi) - leaq 4(%rsi),%rsi - leaq 4(%rdi),%rdi - - /* If no bytes left, we're done: */ - andl %edx,%edx - jz .L_finish_copy - - /* Perform byte "cache" loop-copy for the remainder */ -.L_1b_cache_copy_entry: - movl %edx,%ecx -.L_1b_cache_copy_loop: -40: movb (%rsi),%al -41: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_1b_cache_copy_loop - - /* Finished copying; fence the prior stores */ -.L_finish_copy: - xorl %eax,%eax - ASM_CLAC - sfence - RET - -.L_fixup_4x8b_copy: - shll $6,%ecx - addl %ecx,%edx - jmp .L_fixup_handle_tail -.L_fixup_8b_copy: - lea (%rdx,%rcx,8),%rdx - jmp .L_fixup_handle_tail -.L_fixup_4b_copy: - lea (%rdx,%rcx,4),%rdx - jmp .L_fixup_handle_tail -.L_fixup_1b_copy: - movl %ecx,%edx -.L_fixup_handle_tail: - sfence - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) - _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) -SYM_FUNC_END(__copy_user_nocache) -EXPORT_SYMBOL(__copy_user_nocache) + _ASM_EXTABLE_UA(10b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(11b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(12b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(13b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(14b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(15b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(16b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(17b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(20b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(21b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(22b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(23b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(24b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(25b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(26b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) +SYM_FUNC_END(rep_movs_alternative) +EXPORT_SYMBOL(rep_movs_alternative) diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S new file mode 100644 index 000000000000..5c5f38d32672 --- /dev/null +++ b/arch/x86/lib/copy_user_uncached_64.S @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org> + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/export.h> + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * + * This copies from user space into kernel space, but the kernel + * space accesses can take a machine check exception, so they too + * need exception handling. + * + * Note: only 32-bit and 64-bit stores have non-temporal versions, + * and we only use aligned versions. Any unaligned parts at the + * start or end of the copy will be done using normal cached stores. + * + * Input: + * rdi destination + * rsi source + * edx count + * + * Output: + * rax uncopied bytes or 0 if successful. + */ +SYM_FUNC_START(__copy_user_nocache) + /* If destination is not 7-byte aligned, we'll have to align it */ + testb $7,%dil + jne .Lalign + +.Lis_aligned: + cmp $64,%edx + jb .Lquadwords + + .p2align 4,0x90 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +20: movnti %r8,(%rdi) +21: movnti %r9,8(%rdi) +22: movnti %r10,16(%rdi) +23: movnti %r11,24(%rdi) +30: movq 32(%rsi),%r8 +31: movq 40(%rsi),%r9 +32: movq 48(%rsi),%r10 +33: movq 56(%rsi),%r11 +40: movnti %r8,32(%rdi) +41: movnti %r9,40(%rdi) +42: movnti %r10,48(%rdi) +43: movnti %r11,56(%rdi) + + addq $64,%rsi + addq $64,%rdi + sub $64,%edx + cmp $64,%edx + jae .Lunrolled + +/* + * First set of user mode loads have been done + * without any stores, so if they fail, we can + * just try the non-unrolled loop. + */ +_ASM_EXTABLE_UA(10b, .Lquadwords) +_ASM_EXTABLE_UA(11b, .Lquadwords) +_ASM_EXTABLE_UA(12b, .Lquadwords) +_ASM_EXTABLE_UA(13b, .Lquadwords) + +/* + * The second set of user mode loads have been + * done with 32 bytes stored to the destination, + * so we need to take that into account before + * falling back to the unrolled loop. + */ +_ASM_EXTABLE_UA(30b, .Lfixup32) +_ASM_EXTABLE_UA(31b, .Lfixup32) +_ASM_EXTABLE_UA(32b, .Lfixup32) +_ASM_EXTABLE_UA(33b, .Lfixup32) + +/* + * An exception on a write means that we're + * done, but we need to update the count + * depending on where in the unrolled loop + * we were. + */ +_ASM_EXTABLE_UA(20b, .Ldone0) +_ASM_EXTABLE_UA(21b, .Ldone8) +_ASM_EXTABLE_UA(22b, .Ldone16) +_ASM_EXTABLE_UA(23b, .Ldone24) +_ASM_EXTABLE_UA(40b, .Ldone32) +_ASM_EXTABLE_UA(41b, .Ldone40) +_ASM_EXTABLE_UA(42b, .Ldone48) +_ASM_EXTABLE_UA(43b, .Ldone56) + +.Lquadwords: + cmp $8,%edx + jb .Llong +50: movq (%rsi),%rax +51: movnti %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%edx + jmp .Lquadwords + +/* + * If we fail on the last full quadword, we will + * not try to do any byte-wise cached accesses. + * We will try to do one more 4-byte uncached + * one, though. + */ +_ASM_EXTABLE_UA(50b, .Llast4) +_ASM_EXTABLE_UA(51b, .Ldone0) + +.Llong: + test $4,%dl + je .Lword +60: movl (%rsi),%eax +61: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx +.Lword: + sfence + test $2,%dl + je .Lbyte +70: movw (%rsi),%ax +71: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lbyte: + test $1,%dl + je .Ldone +80: movb (%rsi),%al +81: movb %al,(%rdi) + dec %edx +.Ldone: + mov %edx,%eax + RET + +/* + * If we fail on the last four bytes, we won't + * bother with any fixups. It's dead, Jim. Note + * that there's no need for 'sfence' for any + * of this, since the exception will have been + * serializing. + */ +_ASM_EXTABLE_UA(60b, .Ldone) +_ASM_EXTABLE_UA(61b, .Ldone) +_ASM_EXTABLE_UA(70b, .Ldone) +_ASM_EXTABLE_UA(71b, .Ldone) +_ASM_EXTABLE_UA(80b, .Ldone) +_ASM_EXTABLE_UA(81b, .Ldone) + +/* + * This is the "head needs aliging" case when + * the destination isn't 8-byte aligned. The + * 4-byte case can be done uncached, but any + * smaller alignment is done with regular stores. + */ +.Lalign: + test $1,%dil + je .Lalign_word + test %edx,%edx + je .Ldone +90: movb (%rsi),%al +91: movb %al,(%rdi) + inc %rsi + inc %rdi + dec %edx +.Lalign_word: + test $2,%dil + je .Lalign_long + cmp $2,%edx + jb .Lbyte +92: movw (%rsi),%ax +93: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lalign_long: + test $4,%dil + je .Lis_aligned + cmp $4,%edx + jb .Lword +94: movl (%rsi),%eax +95: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx + jmp .Lis_aligned + +/* + * If we fail on the initial alignment accesses, + * we're all done. Again, no point in trying to + * do byte-by-byte probing if the 4-byte load + * fails - we're not doing any uncached accesses + * any more. + */ +_ASM_EXTABLE_UA(90b, .Ldone) +_ASM_EXTABLE_UA(91b, .Ldone) +_ASM_EXTABLE_UA(92b, .Ldone) +_ASM_EXTABLE_UA(93b, .Ldone) +_ASM_EXTABLE_UA(94b, .Ldone) +_ASM_EXTABLE_UA(95b, .Ldone) + +/* + * Exception table fixups for faults in the middle + */ +.Ldone56: sub $8,%edx +.Ldone48: sub $8,%edx +.Ldone40: sub $8,%edx +.Ldone32: sub $8,%edx +.Ldone24: sub $8,%edx +.Ldone16: sub $8,%edx +.Ldone8: sub $8,%edx +.Ldone0: + mov %edx,%eax + RET + +.Lfixup32: + addq $32,%rsi + addq $32,%rdi + sub $32,%edx + jmp .Lquadwords + +.Llast4: +52: movl (%rsi),%eax +53: movnti %eax,(%rdi) + sfence + sub $4,%edx + mov %edx,%eax + RET +_ASM_EXTABLE_UA(52b, .Ldone0) +_ASM_EXTABLE_UA(53b, .Ldone0) + +SYM_FUNC_END(__copy_user_nocache) +EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index a64017602010..8f95fb267caa 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -11,13 +11,6 @@ .section .noinstr.text, "ax" /* - * We build a jump to memcpy_orig by default which gets NOPped out on - * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which - * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs - * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. - */ - -/* * memcpy - Copy a memory block. * * Input: @@ -27,17 +20,21 @@ * * Output: * rax original destination + * + * The FSRM alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep movsb' itself is small enough to replace the call, but the + * two register moves blow up the code. And one of them is "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. */ SYM_TYPED_FUNC_START(__memcpy) - ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memcpy_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM movq %rdi, %rax movq %rdx, %rcx - shrq $3, %rcx - andl $7, %edx - rep movsq - movl %edx, %ecx rep movsb RET SYM_FUNC_END(__memcpy) @@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) -/* - * memcpy_erms() - enhanced fast string memcpy. This is faster and - * simpler than memcpy. Use memcpy_erms when possible. - */ -SYM_FUNC_START_LOCAL(memcpy_erms) - movq %rdi, %rax - movq %rdx, %rcx - rep movsb - RET -SYM_FUNC_END(memcpy_erms) - SYM_FUNC_START_LOCAL(memcpy_orig) movq %rdi, %rax diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6143b1a6fa2c..7c59a704c458 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -18,27 +18,22 @@ * rdx count (bytes) * * rax original destination + * + * The FSRS alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep stosb' itself is small enough to replace the call, but all + * the register moves blow up the code. And two of them are "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. */ SYM_FUNC_START(__memset) - /* - * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended - * to use it when possible. If not available, use fast string instructions. - * - * Otherwise, use original memset function. - */ - ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memset_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS movq %rdi,%r9 + movb %sil,%al movq %rdx,%rcx - andl $7,%edx - shrq $3,%rcx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - imulq %rsi,%rax - rep stosq - movl %edx,%ecx rep stosb movq %r9,%rax RET @@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset) -/* - * ISO C memset - set a memory block to a byte value. This function uses - * enhanced rep stosb to override the fast string function. - * The code is simpler and shorter than the fast string function as well. - * - * rdi destination - * rsi value (char) - * rdx count (bytes) - * - * rax original destination - */ -SYM_FUNC_START_LOCAL(memset_erms) - movq %rdi,%r9 - movb %sil,%al - movq %rdx,%rcx - rep stosb - movq %r9,%rax - RET -SYM_FUNC_END(memset_erms) - SYM_FUNC_START_LOCAL(memset_orig) movq %rdi,%r10 diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 6c1f8ac5e721..c3a5bbc0b41e 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) { unsigned long flushed, dest = (unsigned long) dst; - long rc = __copy_user_nocache(dst, src, size, 0); + long rc; + + stac(); + rc = __copy_user_nocache(dst, src, size); + clac(); /* * __copy_user_nocache() uses non-temporal stores for the bulk diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cb258f58fdc8..cbc53da4c1b4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -806,7 +806,7 @@ void __init poking_init(void) BUG_ON(!poking_mm); /* Xen PV guests need the PGD to be pinned. */ - paravirt_arch_dup_mmap(NULL, poking_mm); + paravirt_enter_mmap(poking_mm); /* * Randomize the poking address, but make sure that the following page diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6453fbaedb08..aa7d279321ea 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -116,6 +116,11 @@ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *des if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; + if (x86_platform.hyper.is_private_mmio(addr)) { + desc->flags |= IORES_MAP_ENCRYPTED; + return; + } + if (!IS_ENABLED(CONFIG_EFI)) return; diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 9c4d8dbcb129..e0b51c09109f 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -513,10 +513,14 @@ void __init mem_encrypt_free_decrypted_mem(void) npages = (vaddr_end - vaddr) >> PAGE_SHIFT; /* - * The unused memory range was mapped decrypted, change the encryption - * attribute from decrypted to encrypted before freeing it. + * If the unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. Base the + * re-encryption on the same condition used for the decryption in + * sme_postprocess_startup(). Higher level abstractions, such as + * CC_ATTR_MEM_ENCRYPT, aren't necessarily equivalent in a Hyper-V VM + * using vTOM, where sme_me_mask is always zero. */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { + if (sme_me_mask) { r = set_memory_encrypted(vaddr, npages); if (r) { pr_warn("failed to free unused decrypted pages\n"); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 356758b7d4b4..7159cf787613 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -234,7 +234,7 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end) * take full advantage of the the limited (s32) immediate addressing range (2G) * of x86_64. * - * See Documentation/x86/x86_64/mm.rst for more detail. + * See Documentation/arch/x86/x86_64/mm.rst for more detail. */ static inline unsigned long highmap_start_pfn(void) @@ -2175,9 +2175,6 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { - if (hv_is_isolation_supported()) - return hv_set_mem_host_visibility(addr, numpages, !enc); - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return __set_memory_enc_pgtable(addr, numpages, enc); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 92d73ccede70..16c5292d227d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -925,7 +925,7 @@ void flush_tlb_multi(const struct cpumask *cpumask, } /* - * See Documentation/x86/tlb.rst for details. We choose 33 + * See Documentation/arch/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at * least 95%) of allocations, and is small enough that we are * confident it will not cause too much overhead. Each single diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 615a76d70019..bf5161dcf89e 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -7,6 +7,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/amd_nb.h> #include <asm/hpet.h> #include <asm/pci_x86.h> @@ -824,3 +825,23 @@ static void rs690_fix_64bit_dma(struct pci_dev *pdev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma); #endif + +#ifdef CONFIG_AMD_NB + +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2 0x10136008 +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK 0x00000080L + +static void quirk_clear_strap_no_soft_reset_dev2_f0(struct pci_dev *dev) +{ + u32 data; + + if (!amd_smn_read(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, &data)) { + data &= ~AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK; + if (amd_smn_write(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, data)) + pci_err(dev, "Failed to write data 0x%x\n", data); + } else { + pci_err(dev, "Failed to read data\n"); + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b8, quirk_clear_strap_no_soft_reset_dev2_f0); +#endif diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index ed0442e35434..00a92cb2c814 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -86,7 +86,7 @@ static void __init init_pvh_bootparams(bool xen_guest) } /* - * See Documentation/x86/boot.rst. + * See Documentation/arch/x86/boot.rst. * * Version 2.12 supports Xen entry point but we will use default x86/PC * environment (i.e. hardware_subarch 0). diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 17f09dc26381..82fec66d46d2 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -69,8 +69,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_string.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index ee29fb558f2e..b3b8d289b9ab 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -885,14 +885,7 @@ void xen_mm_unpin_all(void) spin_unlock(&pgd_lock); } -static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - spin_lock(&next->page_table_lock); - xen_pgd_pin(next); - spin_unlock(&next->page_table_lock); -} - -static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_enter_mmap(struct mm_struct *mm) { spin_lock(&mm->page_table_lock); xen_pgd_pin(mm); @@ -2153,8 +2146,7 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), #endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, + .enter_mmap = xen_enter_mmap, .exit_mmap = xen_exit_mmap, .lazy_mode = { diff --git a/arch/xtensa/include/asm/initialize_mmu.h b/arch/xtensa/include/asm/initialize_mmu.h index 9793b49fc641..574795a20d6f 100644 --- a/arch/xtensa/include/asm/initialize_mmu.h +++ b/arch/xtensa/include/asm/initialize_mmu.h @@ -43,7 +43,7 @@ #if XCHAL_HAVE_S32C1I && (XCHAL_HW_MIN_VERSION >= XTENSA_HWVERSION_RC_2009_0) /* * We Have Atomic Operation Control (ATOMCTL) Register; Initialize it. - * For details see Documentation/xtensa/atomctl.rst + * For details see Documentation/arch/xtensa/atomctl.rst */ #if XCHAL_DCACHE_IS_COHERENT movi a3, 0x25 /* For SMP/MX -- internal for writeback, diff --git a/block/blk-map.c b/block/blk-map.c index 9137d16cecdc..04c55f1c492e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -29,10 +29,11 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); if (!bmd) return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); bmd->iter = *data; - if (iter_is_iovec(data)) - bmd->iter.iov = bmd->iov; + if (iter_is_iovec(data)) { + memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs); + bmd->iter.__iov = bmd->iov; + } return bmd; } diff --git a/block/blk-mq.c b/block/blk-mq.c index f0ea9dcfb966..2831f78f86a0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2878,16 +2878,15 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!plug) return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { *bio = NULL; return NULL; } - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; - type = blk_mq_get_hctx_type((*bio)->bi_opf); hctx_type = rq->mq_hctx->type; if (type != hctx_type && diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 5042cc54fa5e..a7a49b17ceb1 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -33,7 +33,11 @@ extern __initconst const unsigned long system_certificate_list_size; extern __initconst const unsigned long module_cert_size; /** - * restrict_link_to_builtin_trusted - Restrict keyring addition by built in CA + * restrict_link_by_builtin_trusted - Restrict keyring addition by built-in CA + * @dest_keyring: Keyring being linked to. + * @type: The type of key being added. + * @payload: The payload of the new key. + * @restriction_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. @@ -50,7 +54,11 @@ int restrict_link_by_builtin_trusted(struct key *dest_keyring, #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING /** * restrict_link_by_builtin_and_secondary_trusted - Restrict keyring - * addition by both builtin and secondary keyrings + * addition by both built-in and secondary keyrings. + * @dest_keyring: Keyring being linked to. + * @type: The type of key being added. + * @payload: The payload of the new key. + * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system @@ -75,7 +83,7 @@ int restrict_link_by_builtin_and_secondary_trusted( secondary_trusted_keys); } -/** +/* * Allocate a struct key_restriction for the "builtin and secondary trust" * keyring. Only for use in system_trusted_keyring_init(). */ diff --git a/crypto/asymmetric_keys/restrict.c b/crypto/asymmetric_keys/restrict.c index 6b1ac5f5896a..276bdb627498 100644 --- a/crypto/asymmetric_keys/restrict.c +++ b/crypto/asymmetric_keys/restrict.c @@ -108,6 +108,46 @@ int restrict_link_by_signature(struct key *dest_keyring, return ret; } +/** + * restrict_link_by_ca - Restrict additions to a ring of CA keys + * @dest_keyring: Keyring being linked to. + * @type: The type of key being added. + * @payload: The payload of the new key. + * @trust_keyring: Unused. + * + * Check if the new certificate is a CA. If it is a CA, then mark the new + * certificate as being ok to link. + * + * Returns 0 if the new certificate was accepted, -ENOKEY if the + * certificate is not a CA. -ENOPKG if the signature uses unsupported + * crypto, or some other error if there is a matching certificate but + * the signature check cannot be performed. + */ +int restrict_link_by_ca(struct key *dest_keyring, + const struct key_type *type, + const union key_payload *payload, + struct key *trust_keyring) +{ + const struct public_key *pkey; + + if (type != &key_type_asymmetric) + return -EOPNOTSUPP; + + pkey = payload->data[asym_crypto]; + if (!pkey) + return -ENOPKG; + if (!test_bit(KEY_EFLAG_CA, &pkey->key_eflags)) + return -ENOKEY; + if (!test_bit(KEY_EFLAG_KEYCERTSIGN, &pkey->key_eflags)) + return -ENOKEY; + if (!IS_ENABLED(CONFIG_INTEGRITY_CA_MACHINE_KEYRING_MAX)) + return 0; + if (test_bit(KEY_EFLAG_DIGITALSIG, &pkey->key_eflags)) + return -ENOKEY; + + return 0; +} + static bool match_either_id(const struct asymmetric_key_id **pair, const struct asymmetric_key_id *single) { diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index 7a9b084e2043..0a7049b470c1 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -579,6 +579,34 @@ int x509_process_extension(void *context, size_t hdrlen, return 0; } + if (ctx->last_oid == OID_keyUsage) { + /* + * Get hold of the keyUsage bit string + * v[1] is the encoding size + * (Expect either 0x02 or 0x03, making it 1 or 2 bytes) + * v[2] is the number of unused bits in the bit string + * (If >= 3 keyCertSign is missing when v[1] = 0x02) + * v[3] and possibly v[4] contain the bit string + * + * From RFC 5280 4.2.1.3: + * 0x04 is where keyCertSign lands in this bit string + * 0x80 is where digitalSignature lands in this bit string + */ + if (v[0] != ASN1_BTS) + return -EBADMSG; + if (vlen < 4) + return -EBADMSG; + if (v[2] >= 8) + return -EBADMSG; + if (v[3] & 0x80) + ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_DIGITALSIG; + if (v[1] == 0x02 && v[2] <= 2 && (v[3] & 0x04)) + ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN; + else if (vlen > 4 && v[1] == 0x03 && (v[3] & 0x04)) + ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN; + return 0; + } + if (ctx->last_oid == OID_authorityKeyIdentifier) { /* Get hold of the CA key fingerprint */ ctx->raw_akid = v; @@ -586,6 +614,28 @@ int x509_process_extension(void *context, size_t hdrlen, return 0; } + if (ctx->last_oid == OID_basicConstraints) { + /* + * Get hold of the basicConstraints + * v[1] is the encoding size + * (Expect 0x2 or greater, making it 1 or more bytes) + * v[2] is the encoding type + * (Expect an ASN1_BOOL for the CA) + * v[3] is the contents of the ASN1_BOOL + * (Expect 1 if the CA is TRUE) + * vlen should match the entire extension size + */ + if (v[0] != (ASN1_CONS_BIT | ASN1_SEQ)) + return -EBADMSG; + if (vlen < 2) + return -EBADMSG; + if (v[1] != vlen - 2) + return -EBADMSG; + if (vlen >= 4 && v[1] != 0 && v[2] == ASN1_BOOL && v[3] == 1) + ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_CA; + return 0; + } + return 0; } diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c index 82d1728b9bc6..df596d46dd97 100644 --- a/drivers/acpi/acpica/evevent.c +++ b/drivers/acpi/acpica/evevent.c @@ -142,9 +142,6 @@ static acpi_status acpi_ev_fixed_event_initialize(void) status = acpi_write_bit_register(acpi_gbl_fixed_event_info [i].enable_register_id, - (i == - ACPI_EVENT_PCIE_WAKE) ? - ACPI_ENABLE_EVENT : ACPI_DISABLE_EVENT); if (ACPI_FAILURE(status)) { return (status); @@ -188,11 +185,6 @@ u32 acpi_ev_fixed_event_detect(void) return (int_status); } - if (fixed_enable & ACPI_BITMASK_PCIEXP_WAKE_DISABLE) - fixed_enable &= ~ACPI_BITMASK_PCIEXP_WAKE_DISABLE; - else - fixed_enable |= ACPI_BITMASK_PCIEXP_WAKE_DISABLE; - ACPI_DEBUG_PRINT((ACPI_DB_INTERRUPTS, "Fixed Event Block: Enable %08X Status %08X\n", fixed_enable, fixed_status)); @@ -258,9 +250,6 @@ static u32 acpi_ev_fixed_event_dispatch(u32 event) if (!acpi_gbl_fixed_event_handlers[event].handler) { (void)acpi_write_bit_register(acpi_gbl_fixed_event_info[event]. enable_register_id, - (event == - ACPI_EVENT_PCIE_WAKE) ? - ACPI_ENABLE_EVENT : ACPI_DISABLE_EVENT); ACPI_ERROR((AE_INFO, diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 37b3f641feaa..bd936476dda9 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -311,20 +311,6 @@ acpi_status acpi_hw_legacy_wake(u8 sleep_state) [ACPI_EVENT_SLEEP_BUTTON]. status_register_id, ACPI_CLEAR_STATUS); - /* Enable pcie wake event if support */ - if ((acpi_gbl_FADT.flags & ACPI_FADT_PCI_EXPRESS_WAKE)) { - (void) - acpi_write_bit_register(acpi_gbl_fixed_event_info - [ACPI_EVENT_PCIE_WAKE]. - enable_register_id, - ACPI_DISABLE_EVENT); - (void) - acpi_write_bit_register(acpi_gbl_fixed_event_info - [ACPI_EVENT_PCIE_WAKE]. - status_register_id, - ACPI_CLEAR_STATUS); - } - acpi_hw_execute_sleep_method(METHOD_PATHNAME__SST, ACPI_SST_WORKING); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c index 53afa5edb6ec..cda6e16dddf7 100644 --- a/drivers/acpi/acpica/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -186,10 +186,6 @@ struct acpi_fixed_event_info acpi_gbl_fixed_event_info[ACPI_NUM_FIXED_EVENTS] = ACPI_BITREG_RT_CLOCK_ENABLE, ACPI_BITMASK_RT_CLOCK_STATUS, ACPI_BITMASK_RT_CLOCK_ENABLE}, - /* ACPI_EVENT_PCIE_WAKE */ {ACPI_BITREG_PCIEXP_WAKE_STATUS, - ACPI_BITREG_PCIEXP_WAKE_DISABLE, - ACPI_BITMASK_PCIEXP_WAKE_STATUS, - ACPI_BITMASK_PCIEXP_WAKE_DISABLE}, }; #endif /* !ACPI_REDUCED_HARDWARE */ diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 7b4801ce62d6..e8492b3a393a 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -440,6 +440,13 @@ static const struct dmi_system_id asus_laptop[] = { }, }, { + .ident = "Asus ExpertBook B1502CBA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, "B1502CBA"), + }, + }, + { .ident = "Asus ExpertBook B2402CBA", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c index da5727069d85..ba420a28a4aa 100644 --- a/drivers/acpi/x86/utils.c +++ b/drivers/acpi/x86/utils.c @@ -213,6 +213,7 @@ bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *s disk in the system. */ static const struct x86_cpu_id storage_d3_cpu_ids[] = { + X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 24, NULL), /* Picasso */ X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 96, NULL), /* Renoir */ X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 104, NULL), /* Lucienne */ X86_MATCH_VENDOR_FAM_MODEL(AMD, 25, 80, NULL), /* Cezanne */ diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 182c6122f815..c1815b9dae68 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -487,7 +487,8 @@ static const struct attribute_group *cpu_root_attr_groups[] = { bool cpu_is_hotpluggable(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); - return dev && container_of(dev, struct cpu, dev)->hotpluggable; + return dev && container_of(dev, struct cpu, dev)->hotpluggable + && tick_nohz_cpu_hotpluggable(cpu); } EXPORT_SYMBOL_GPL(cpu_is_hotpluggable); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 60757ac31701..f49f2a5282e1 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1615,7 +1615,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) drbd_send_sync_param(peer_device); } - kvfree_rcu(old_disk_conf); + kvfree_rcu_mightsleep(old_disk_conf); kfree(old_plan); mod_timer(&device->request_timer, jiffies + HZ); goto success; @@ -2446,7 +2446,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&connection->resource->conf_update); mutex_unlock(&connection->data.mutex); - kvfree_rcu(old_net_conf); + kvfree_rcu_mightsleep(old_net_conf); if (connection->cstate >= C_WF_REPORT_PARAMS) { struct drbd_peer_device *peer_device; @@ -2860,7 +2860,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) new_disk_conf->disk_size = (sector_t)rs.resize_size; rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&device->resource->conf_update); - kvfree_rcu(old_disk_conf); + kvfree_rcu_mightsleep(old_disk_conf); new_disk_conf = NULL; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 757f4692b5bd..e197b2a465d2 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3759,7 +3759,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in drbd_info(connection, "peer data-integrity-alg: %s\n", integrity_alg[0] ? integrity_alg : "(none)"); - kvfree_rcu(old_net_conf); + kvfree_rcu_mightsleep(old_net_conf); return 0; disconnect_rcu_unlock: @@ -4127,7 +4127,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&connection->resource->conf_update); - kvfree_rcu(old_disk_conf); + kvfree_rcu_mightsleep(old_disk_conf); drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", (unsigned long)p_usize, (unsigned long)my_usize); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 75d13ea0024f..2aeea295fa28 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -2071,7 +2071,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) conn_free_crypto(connection); mutex_unlock(&connection->resource->conf_update); - kvfree_rcu(old_conf); + kvfree_rcu_mightsleep(old_conf); } if (ns_max.susp_fen) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2723eede6f21..2b918e28acaa 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -96,16 +96,14 @@ struct virtblk_req { /* * The zone append command has an extended in header. - * The status field in zone_append_in_hdr must have - * the same offset in virtblk_req as the non-zoned - * status field above. + * The status field in zone_append_in_hdr must always + * be the last byte. */ struct { + __virtio64 sector; u8 status; - u8 reserved[7]; - __le64 append_sector; - } zone_append_in_hdr; - }; + } zone_append; + } in_hdr; size_t in_hdr_len; @@ -154,7 +152,7 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) sgs[num_out + num_in++] = vbr->sg_table.sgl; } - sg_init_one(&in_hdr, &vbr->status, vbr->in_hdr_len); + sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); sgs[num_out + num_in++] = &in_hdr; return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); @@ -242,11 +240,14 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, struct request *req, struct virtblk_req *vbr) { - size_t in_hdr_len = sizeof(vbr->status); + size_t in_hdr_len = sizeof(vbr->in_hdr.status); bool unmap = false; u32 type; u64 sector = 0; + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req))) + return BLK_STS_NOTSUPP; + /* Set fields for all request types */ vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); @@ -287,7 +288,7 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, case REQ_OP_ZONE_APPEND: type = VIRTIO_BLK_T_ZONE_APPEND; sector = blk_rq_pos(req); - in_hdr_len = sizeof(vbr->zone_append_in_hdr); + in_hdr_len = sizeof(vbr->in_hdr.zone_append); break; case REQ_OP_ZONE_RESET: type = VIRTIO_BLK_T_ZONE_RESET; @@ -297,7 +298,10 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, type = VIRTIO_BLK_T_ZONE_RESET_ALL; break; case REQ_OP_DRV_IN: - /* Out header already filled in, nothing to do */ + /* + * Out header has already been prepared by the caller (virtblk_get_id() + * or virtblk_submit_zone_report()), nothing to do here. + */ return 0; default: WARN_ON_ONCE(1); @@ -318,16 +322,28 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, return 0; } +/* + * The status byte is always the last byte of the virtblk request + * in-header. This helper fetches its value for all in-header formats + * that are currently defined. + */ +static inline u8 virtblk_vbr_status(struct virtblk_req *vbr) +{ + return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1); +} + static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - blk_status_t status = virtblk_result(vbr->status); + blk_status_t status = virtblk_result(virtblk_vbr_status(vbr)); + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; virtblk_unmap_data(req, vbr); virtblk_cleanup_cmd(req); if (req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = le64_to_cpu(vbr->zone_append_in_hdr.append_sector); + req->__sector = virtio64_to_cpu(vblk->vdev, + vbr->in_hdr.zone_append.sector); blk_mq_end_request(req, status); } @@ -355,7 +371,7 @@ static int virtblk_handle_req(struct virtio_blk_vq *vq, if (likely(!blk_should_fake_timeout(req->q)) && !blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, vbr->status, + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) virtblk_request_done(req); req_done++; @@ -550,7 +566,6 @@ static void virtio_queue_rqs(struct request **rqlist) #ifdef CONFIG_BLK_DEV_ZONED static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk, unsigned int nr_zones, - unsigned int zone_sectors, size_t *buflen) { struct request_queue *q = vblk->disk->queue; @@ -558,7 +573,7 @@ static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk, void *buf; nr_zones = min_t(unsigned int, nr_zones, - get_capacity(vblk->disk) >> ilog2(zone_sectors)); + get_capacity(vblk->disk) >> ilog2(vblk->zone_sectors)); bufsize = sizeof(struct virtio_blk_zone_report) + nr_zones * sizeof(struct virtio_blk_zone_descriptor); @@ -592,7 +607,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk, return PTR_ERR(req); vbr = blk_mq_rq_to_pdu(req); - vbr->in_hdr_len = sizeof(vbr->status); + vbr->in_hdr_len = sizeof(vbr->in_hdr.status); vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT); vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector); @@ -601,7 +616,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk, goto out; blk_execute_rq(req, false); - err = blk_status_to_errno(virtblk_result(vbr->status)); + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status)); out: blk_mq_free_request(req); return err; @@ -609,29 +624,72 @@ out: static int virtblk_parse_zone(struct virtio_blk *vblk, struct virtio_blk_zone_descriptor *entry, - unsigned int idx, unsigned int zone_sectors, - report_zones_cb cb, void *data) + unsigned int idx, report_zones_cb cb, void *data) { struct blk_zone zone = { }; - if (entry->z_type != VIRTIO_BLK_ZT_SWR && - entry->z_type != VIRTIO_BLK_ZT_SWP && - entry->z_type != VIRTIO_BLK_ZT_CONV) { - dev_err(&vblk->vdev->dev, "invalid zone type %#x\n", - entry->z_type); - return -EINVAL; + zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start); + if (zone.start + vblk->zone_sectors <= get_capacity(vblk->disk)) + zone.len = vblk->zone_sectors; + else + zone.len = get_capacity(vblk->disk) - zone.start; + zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap); + zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp); + + switch (entry->z_type) { + case VIRTIO_BLK_ZT_SWR: + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + break; + case VIRTIO_BLK_ZT_SWP: + zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF; + break; + case VIRTIO_BLK_ZT_CONV: + zone.type = BLK_ZONE_TYPE_CONVENTIONAL; + break; + default: + dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n", + zone.start, entry->z_type); + return -EIO; } - zone.type = entry->z_type; - zone.cond = entry->z_state; - zone.len = zone_sectors; - zone.capacity = le64_to_cpu(entry->z_cap); - zone.start = le64_to_cpu(entry->z_start); - if (zone.cond == BLK_ZONE_COND_FULL) + switch (entry->z_state) { + case VIRTIO_BLK_ZS_EMPTY: + zone.cond = BLK_ZONE_COND_EMPTY; + break; + case VIRTIO_BLK_ZS_CLOSED: + zone.cond = BLK_ZONE_COND_CLOSED; + break; + case VIRTIO_BLK_ZS_FULL: + zone.cond = BLK_ZONE_COND_FULL; zone.wp = zone.start + zone.len; - else - zone.wp = le64_to_cpu(entry->z_wp); + break; + case VIRTIO_BLK_ZS_EOPEN: + zone.cond = BLK_ZONE_COND_EXP_OPEN; + break; + case VIRTIO_BLK_ZS_IOPEN: + zone.cond = BLK_ZONE_COND_IMP_OPEN; + break; + case VIRTIO_BLK_ZS_NOT_WP: + zone.cond = BLK_ZONE_COND_NOT_WP; + break; + case VIRTIO_BLK_ZS_RDONLY: + zone.cond = BLK_ZONE_COND_READONLY; + zone.wp = ULONG_MAX; + break; + case VIRTIO_BLK_ZS_OFFLINE: + zone.cond = BLK_ZONE_COND_OFFLINE; + zone.wp = ULONG_MAX; + break; + default: + dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n", + zone.start, entry->z_state); + return -EIO; + } + /* + * The callback below checks the validity of the reported + * entry data, no need to further validate it here. + */ return cb(&zone, idx, data); } @@ -641,39 +699,47 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector, { struct virtio_blk *vblk = disk->private_data; struct virtio_blk_zone_report *report; - unsigned int zone_sectors = vblk->zone_sectors; - unsigned int nz, i; - int ret, zone_idx = 0; + unsigned long long nz, i; size_t buflen; + unsigned int zone_idx = 0; + int ret; if (WARN_ON_ONCE(!vblk->zone_sectors)) return -EOPNOTSUPP; - report = virtblk_alloc_report_buffer(vblk, nr_zones, - zone_sectors, &buflen); + report = virtblk_alloc_report_buffer(vblk, nr_zones, &buflen); if (!report) return -ENOMEM; + mutex_lock(&vblk->vdev_mutex); + + if (!vblk->vdev) { + ret = -ENXIO; + goto fail_report; + } + while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) { memset(report, 0, buflen); ret = virtblk_submit_zone_report(vblk, (char *)report, buflen, sector); - if (ret) { - if (ret > 0) - ret = -EIO; - goto out_free; - } - nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones); + if (ret) + goto fail_report; + + nz = min_t(u64, virtio64_to_cpu(vblk->vdev, report->nr_zones), + nr_zones); if (!nz) break; for (i = 0; i < nz && zone_idx < nr_zones; i++) { ret = virtblk_parse_zone(vblk, &report->zones[i], - zone_idx, zone_sectors, cb, data); + zone_idx, cb, data); if (ret) - goto out_free; - sector = le64_to_cpu(report->zones[i].z_start) + zone_sectors; + goto fail_report; + + sector = virtio64_to_cpu(vblk->vdev, + report->zones[i].z_start) + + vblk->zone_sectors; zone_idx++; } } @@ -682,7 +748,8 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector, ret = zone_idx; else ret = -EINVAL; -out_free: +fail_report: + mutex_unlock(&vblk->vdev_mutex); kvfree(report); return ret; } @@ -691,20 +758,28 @@ static void virtblk_revalidate_zones(struct virtio_blk *vblk) { u8 model; - if (!vblk->zone_sectors) - return; - virtio_cread(vblk->vdev, struct virtio_blk_config, zoned.model, &model); - if (!blk_revalidate_disk_zones(vblk->disk, NULL)) - set_capacity_and_notify(vblk->disk, 0); + switch (model) { + default: + dev_err(&vblk->vdev->dev, "unknown zone model %d\n", model); + fallthrough; + case VIRTIO_BLK_Z_NONE: + case VIRTIO_BLK_Z_HA: + disk_set_zoned(vblk->disk, BLK_ZONED_NONE); + return; + case VIRTIO_BLK_Z_HM: + WARN_ON_ONCE(!vblk->zone_sectors); + if (!blk_revalidate_disk_zones(vblk->disk, NULL)) + set_capacity_and_notify(vblk->disk, 0); + } } static int virtblk_probe_zoned_device(struct virtio_device *vdev, struct virtio_blk *vblk, struct request_queue *q) { - u32 v; + u32 v, wg; u8 model; int ret; @@ -713,16 +788,11 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, switch (model) { case VIRTIO_BLK_Z_NONE: + case VIRTIO_BLK_Z_HA: + /* Present the host-aware device as non-zoned */ return 0; case VIRTIO_BLK_Z_HM: break; - case VIRTIO_BLK_Z_HA: - /* - * Present the host-aware device as a regular drive. - * TODO It is possible to add an option to make it appear - * in the system as a zoned drive. - */ - return 0; default: dev_err(&vdev->dev, "unsupported zone model %d\n", model); return -EINVAL; @@ -735,32 +805,31 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); - disk_set_max_open_zones(vblk->disk, le32_to_cpu(v)); - - dev_dbg(&vdev->dev, "max open zones = %u\n", le32_to_cpu(v)); + disk_set_max_open_zones(vblk->disk, v); + dev_dbg(&vdev->dev, "max open zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, zoned.max_active_zones, &v); - disk_set_max_active_zones(vblk->disk, le32_to_cpu(v)); - dev_dbg(&vdev->dev, "max active zones = %u\n", le32_to_cpu(v)); + disk_set_max_active_zones(vblk->disk, v); + dev_dbg(&vdev->dev, "max active zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, - zoned.write_granularity, &v); - if (!v) { + zoned.write_granularity, &wg); + if (!wg) { dev_warn(&vdev->dev, "zero write granularity reported\n"); return -ENODEV; } - blk_queue_physical_block_size(q, le32_to_cpu(v)); - blk_queue_io_min(q, le32_to_cpu(v)); + blk_queue_physical_block_size(q, wg); + blk_queue_io_min(q, wg); - dev_dbg(&vdev->dev, "write granularity = %u\n", le32_to_cpu(v)); + dev_dbg(&vdev->dev, "write granularity = %u\n", wg); /* * virtio ZBD specification doesn't require zones to be a power of * two sectors in size, but the code in this driver expects that. */ - virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, &v); - vblk->zone_sectors = le32_to_cpu(v); + virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, + &vblk->zone_sectors); if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) { dev_err(&vdev->dev, "zoned device with non power of two zone size %u\n", @@ -783,36 +852,46 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, dev_warn(&vdev->dev, "zero max_append_sectors reported\n"); return -ENODEV; } - blk_queue_max_zone_append_sectors(q, le32_to_cpu(v)); - dev_dbg(&vdev->dev, "max append sectors = %u\n", le32_to_cpu(v)); + if ((v << SECTOR_SHIFT) < wg) { + dev_err(&vdev->dev, + "write granularity %u exceeds max_append_sectors %u limit\n", + wg, v); + return -ENODEV; + } + + blk_queue_max_zone_append_sectors(q, v); + dev_dbg(&vdev->dev, "max append sectors = %u\n", v); } return ret; } -static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev) -{ - return virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED); -} #else /* * Zoned block device support is not configured in this kernel. - * We only need to define a few symbols to avoid compilation errors. + * Host-managed zoned devices can't be supported, but others are + * good to go as regular block devices. */ #define virtblk_report_zones NULL + static inline void virtblk_revalidate_zones(struct virtio_blk *vblk) { } + static inline int virtblk_probe_zoned_device(struct virtio_device *vdev, struct virtio_blk *vblk, struct request_queue *q) { - return -EOPNOTSUPP; -} + u8 model; -static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev) -{ - return false; + virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model); + if (model == VIRTIO_BLK_Z_HM) { + dev_err(&vdev->dev, + "virtio_blk: zoned devices are not supported"); + return -EOPNOTSUPP; + } + + return 0; } #endif /* CONFIG_BLK_DEV_ZONED */ @@ -831,7 +910,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) return PTR_ERR(req); vbr = blk_mq_rq_to_pdu(req); - vbr->in_hdr_len = sizeof(vbr->status); + vbr->in_hdr_len = sizeof(vbr->in_hdr.status); vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; @@ -840,7 +919,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) goto out; blk_execute_rq(req, false); - err = blk_status_to_errno(virtblk_result(vbr->status)); + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status)); out: blk_mq_free_request(req); return err; @@ -1498,15 +1577,16 @@ static int virtblk_probe(struct virtio_device *vdev) virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); - if (virtblk_has_zoned_feature(vdev)) { + /* + * All steps that follow use the VQs therefore they need to be + * placed after the virtio_device_ready() call above. + */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { err = virtblk_probe_zoned_device(vdev, vblk, q); if (err) goto out_cleanup_disk; } - dev_info(&vdev->dev, "blk config size: %zu\n", - sizeof(struct virtio_blk_config)); - err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); if (err) goto out_cleanup_disk; @@ -1607,10 +1687,7 @@ static unsigned int features[] = { VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, - VIRTIO_BLK_F_SECURE_ERASE, -#ifdef CONFIG_BLK_DEV_ZONED - VIRTIO_BLK_F_ZONED, -#endif /* CONFIG_BLK_DEV_ZONED */ + VIRTIO_BLK_F_SECURE_ERASE, VIRTIO_BLK_F_ZONED, }; static struct virtio_driver virtio_blk = { diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 3006e2a0f37e..43e98a598bd9 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -511,7 +511,7 @@ static const char *btbcm_get_board_name(struct device *dev) len = strlen(tmp) + 1; board_type = devm_kzalloc(dev, len, GFP_KERNEL); strscpy(board_type, tmp, len); - for (i = 0; i < board_type[i]; i++) { + for (i = 0; i < len; i++) { if (board_type[i] == '/') board_type[i] = '-'; } diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c index 02893600db39..51000320e1ea 100644 --- a/drivers/bluetooth/btsdio.c +++ b/drivers/bluetooth/btsdio.c @@ -358,6 +358,7 @@ static void btsdio_remove(struct sdio_func *func) if (!data) return; + cancel_work_sync(&data->work); hdev = data->hdev; sdio_set_drvdata(func, NULL); diff --git a/drivers/bus/imx-weim.c b/drivers/bus/imx-weim.c index 36d42484142a..cf463c1d2102 100644 --- a/drivers/bus/imx-weim.c +++ b/drivers/bus/imx-weim.c @@ -329,6 +329,12 @@ static int of_weim_notify(struct notifier_block *nb, unsigned long action, "Failed to setup timing for '%pOF'\n", rd->dn); if (!of_node_check_flag(rd->dn, OF_POPULATED)) { + /* + * Clear the flag before adding the device so that + * fw_devlink doesn't skip adding consumers to this + * device. + */ + rd->dn->fwnode.flags &= ~FWNODE_FLAG_NOT_DEVICE; if (!of_platform_device_create(rd->dn, NULL, &pdev->dev)) { dev_err(&pdev->dev, "Failed to create child device '%pOF'\n", diff --git a/drivers/char/tpm/eventlog/common.c b/drivers/char/tpm/eventlog/common.c index 8512ec76d526..639c3f395a5a 100644 --- a/drivers/char/tpm/eventlog/common.c +++ b/drivers/char/tpm/eventlog/common.c @@ -36,7 +36,7 @@ static int tpm_bios_measurements_open(struct inode *inode, inode_unlock(inode); return -ENODEV; } - chip_seqops = (struct tpm_chip_seqops *)inode->i_private; + chip_seqops = inode->i_private; seqops = chip_seqops->seqops; chip = chip_seqops->chip; get_device(&chip->dev); @@ -55,8 +55,8 @@ static int tpm_bios_measurements_open(struct inode *inode, static int tpm_bios_measurements_release(struct inode *inode, struct file *file) { - struct seq_file *seq = (struct seq_file *)file->private_data; - struct tpm_chip *chip = (struct tpm_chip *)seq->private; + struct seq_file *seq = file->private_data; + struct tpm_chip *chip = seq->private; put_device(&chip->dev); diff --git a/drivers/char/tpm/st33zp24/i2c.c b/drivers/char/tpm/st33zp24/i2c.c index c4d0b744e3cc..2d28f55ef490 100644 --- a/drivers/char/tpm/st33zp24/i2c.c +++ b/drivers/char/tpm/st33zp24/i2c.c @@ -138,13 +138,13 @@ static const struct i2c_device_id st33zp24_i2c_id[] = { }; MODULE_DEVICE_TABLE(i2c, st33zp24_i2c_id); -static const struct of_device_id of_st33zp24_i2c_match[] = { +static const struct of_device_id of_st33zp24_i2c_match[] __maybe_unused = { { .compatible = "st,st33zp24-i2c", }, {} }; MODULE_DEVICE_TABLE(of, of_st33zp24_i2c_match); -static const struct acpi_device_id st33zp24_i2c_acpi_match[] = { +static const struct acpi_device_id st33zp24_i2c_acpi_match[] __maybe_unused = { {"SMO3324"}, {} }; diff --git a/drivers/char/tpm/st33zp24/spi.c b/drivers/char/tpm/st33zp24/spi.c index 2154059f0235..f5811b301d3b 100644 --- a/drivers/char/tpm/st33zp24/spi.c +++ b/drivers/char/tpm/st33zp24/spi.c @@ -255,13 +255,13 @@ static const struct spi_device_id st33zp24_spi_id[] = { }; MODULE_DEVICE_TABLE(spi, st33zp24_spi_id); -static const struct of_device_id of_st33zp24_spi_match[] = { +static const struct of_device_id of_st33zp24_spi_match[] __maybe_unused = { { .compatible = "st,st33zp24-spi", }, {} }; MODULE_DEVICE_TABLE(of, of_st33zp24_spi_match); -static const struct acpi_device_id st33zp24_spi_acpi_match[] = { +static const struct acpi_device_id st33zp24_spi_acpi_match[] __maybe_unused = { {"SMO3324"}, {} }; diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index 0601e6e5e326..6fdfa65a00c3 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -606,6 +606,30 @@ static int tpm_get_pcr_allocation(struct tpm_chip *chip) } /* + * tpm_chip_startup() - performs auto startup and allocates the PCRs + * @chip: TPM chip to use. + */ +int tpm_chip_startup(struct tpm_chip *chip) +{ + int rc; + + rc = tpm_chip_start(chip); + if (rc) + return rc; + + rc = tpm_auto_startup(chip); + if (rc) + goto stop; + + rc = tpm_get_pcr_allocation(chip); +stop: + tpm_chip_stop(chip); + + return rc; +} +EXPORT_SYMBOL_GPL(tpm_chip_startup); + +/* * tpm_chip_register() - create a character device for the TPM chip * @chip: TPM chip to use. * @@ -620,20 +644,6 @@ int tpm_chip_register(struct tpm_chip *chip) { int rc; - rc = tpm_chip_start(chip); - if (rc) - return rc; - rc = tpm_auto_startup(chip); - if (rc) { - tpm_chip_stop(chip); - return rc; - } - - rc = tpm_get_pcr_allocation(chip); - tpm_chip_stop(chip); - if (rc) - return rc; - tpm_sysfs_add_device(chip); tpm_bios_log_setup(chip); @@ -682,7 +692,8 @@ EXPORT_SYMBOL_GPL(tpm_chip_register); void tpm_chip_unregister(struct tpm_chip *chip) { tpm_del_legacy_sysfs(chip); - if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip)) + if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip) && + !tpm_amd_is_rng_defective(chip)) hwrng_unregister(&chip->hwrng); tpm_bios_log_teardown(chip); if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip)) diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 830014a26609..88d3bd76e076 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -263,6 +263,7 @@ static inline void tpm_msleep(unsigned int delay_msec) delay_msec * 1000); }; +int tpm_chip_startup(struct tpm_chip *chip); int tpm_chip_start(struct tpm_chip *chip); void tpm_chip_stop(struct tpm_chip *chip); struct tpm_chip *tpm_find_get_ops(struct tpm_chip *chip); diff --git a/drivers/char/tpm/tpm_ftpm_tee.c b/drivers/char/tpm/tpm_ftpm_tee.c index deff23bb54bf..528f35b14fb6 100644 --- a/drivers/char/tpm/tpm_ftpm_tee.c +++ b/drivers/char/tpm/tpm_ftpm_tee.c @@ -334,11 +334,11 @@ static int ftpm_tee_remove(struct device *dev) return 0; } -static int ftpm_plat_tee_remove(struct platform_device *pdev) +static void ftpm_plat_tee_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; - return ftpm_tee_remove(dev); + ftpm_tee_remove(dev); } /** @@ -367,7 +367,7 @@ static struct platform_driver ftpm_tee_plat_driver = { }, .shutdown = ftpm_plat_tee_shutdown, .probe = ftpm_plat_tee_probe, - .remove = ftpm_plat_tee_remove, + .remove_new = ftpm_plat_tee_remove, }; /* UUID of the fTPM TA */ diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index ed5dabd3c72d..7af389806643 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -50,6 +50,45 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da return container_of(data, struct tpm_tis_tcg_phy, priv); } +#ifdef CONFIG_PREEMPT_RT +/* + * Flush previous write operations with a dummy read operation to the + * TPM MMIO base address. + */ +static inline void tpm_tis_flush(void __iomem *iobase) +{ + ioread8(iobase + TPM_ACCESS(0)); +} +#else +#define tpm_tis_flush(iobase) do { } while (0) +#endif + +/* + * Write a byte word to the TPM MMIO address, and flush the write queue. + * The flush ensures that the data is sent immediately over the bus and not + * aggregated with further requests and transferred later in a batch. The large + * write requests can lead to unwanted latency spikes by blocking the CPU until + * the complete batch has been transferred. + */ +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) +{ + iowrite8(b, iobase + addr); + tpm_tis_flush(iobase); +} + +/* + * Write a 32-bit word to the TPM MMIO address, and flush the write queue. + * The flush ensures that the data is sent immediately over the bus and not + * aggregated with further requests and transferred later in a batch. The large + * write requests can lead to unwanted latency spikes by blocking the CPU until + * the complete batch has been transferred. + */ +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) +{ + iowrite32(b, iobase + addr); + tpm_tis_flush(iobase); +} + static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); @@ -186,12 +225,12 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, switch (io_mode) { case TPM_TIS_PHYS_8: while (len--) - iowrite8(*value++, phy->iobase + addr); + tpm_tis_iowrite8(*value++, phy->iobase, addr); break; case TPM_TIS_PHYS_16: return -EINVAL; case TPM_TIS_PHYS_32: - iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase + addr); + tpm_tis_iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase, addr); break; } @@ -227,7 +266,7 @@ static int tpm_tis_init(struct device *dev, struct tpm_info *tpm_info) irq = tpm_info->irq; if (itpm || is_itpm(ACPI_COMPANION(dev))) - phy->priv.flags |= TPM_TIS_ITPM_WORKAROUND; + set_bit(TPM_TIS_ITPM_WORKAROUND, &phy->priv.flags); return tpm_tis_core_init(dev, &phy->priv, irq, &tpm_tcg, ACPI_HANDLE(dev)); @@ -324,14 +363,12 @@ static int tpm_tis_plat_probe(struct platform_device *pdev) return tpm_tis_init(&pdev->dev, &tpm_info); } -static int tpm_tis_plat_remove(struct platform_device *pdev) +static void tpm_tis_plat_remove(struct platform_device *pdev) { struct tpm_chip *chip = dev_get_drvdata(&pdev->dev); tpm_chip_unregister(chip); tpm_tis_remove(chip); - - return 0; } #ifdef CONFIG_OF @@ -344,7 +381,7 @@ MODULE_DEVICE_TABLE(of, tis_of_platform_match); static struct platform_driver tis_drv = { .probe = tpm_tis_plat_probe, - .remove = tpm_tis_plat_remove, + .remove_new = tpm_tis_plat_remove, .driver = { .name = "tpm_tis", .pm = &tpm_tis_pm, diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 3f98e587b3e8..c2421162cf34 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -44,6 +44,20 @@ static bool wait_for_tpm_stat_cond(struct tpm_chip *chip, u8 mask, return false; } +static u8 tpm_tis_filter_sts_mask(u8 int_mask, u8 sts_mask) +{ + if (!(int_mask & TPM_INTF_STS_VALID_INT)) + sts_mask &= ~TPM_STS_VALID; + + if (!(int_mask & TPM_INTF_DATA_AVAIL_INT)) + sts_mask &= ~TPM_STS_DATA_AVAIL; + + if (!(int_mask & TPM_INTF_CMD_READY_INT)) + sts_mask &= ~TPM_STS_COMMAND_READY; + + return sts_mask; +} + static int wait_for_tpm_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout, wait_queue_head_t *queue, bool check_cancel) @@ -53,41 +67,56 @@ static int wait_for_tpm_stat(struct tpm_chip *chip, u8 mask, long rc; u8 status; bool canceled = false; + u8 sts_mask; + int ret = 0; /* check current status */ status = chip->ops->status(chip); if ((status & mask) == mask) return 0; - stop = jiffies + timeout; + sts_mask = mask & (TPM_STS_VALID | TPM_STS_DATA_AVAIL | + TPM_STS_COMMAND_READY); + /* check what status changes can be handled by irqs */ + sts_mask = tpm_tis_filter_sts_mask(priv->int_mask, sts_mask); - if (chip->flags & TPM_CHIP_FLAG_IRQ) { + stop = jiffies + timeout; + /* process status changes with irq support */ + if (sts_mask) { + ret = -ETIME; again: timeout = stop - jiffies; if ((long)timeout <= 0) return -ETIME; rc = wait_event_interruptible_timeout(*queue, - wait_for_tpm_stat_cond(chip, mask, check_cancel, + wait_for_tpm_stat_cond(chip, sts_mask, check_cancel, &canceled), timeout); if (rc > 0) { if (canceled) return -ECANCELED; - return 0; + ret = 0; } if (rc == -ERESTARTSYS && freezing(current)) { clear_thread_flag(TIF_SIGPENDING); goto again; } - } else { - do { - usleep_range(priv->timeout_min, - priv->timeout_max); - status = chip->ops->status(chip); - if ((status & mask) == mask) - return 0; - } while (time_before(jiffies, stop)); } + + if (ret) + return ret; + + mask &= ~sts_mask; + if (!mask) /* all done */ + return 0; + /* process status changes without irq support */ + do { + status = chip->ops->status(chip); + if ((status & mask) == mask) + return 0; + usleep_range(priv->timeout_min, + priv->timeout_max); + } while (time_before(jiffies, stop)); return -ETIME; } @@ -136,16 +165,27 @@ static bool check_locality(struct tpm_chip *chip, int l) return false; } -static int release_locality(struct tpm_chip *chip, int l) +static int __tpm_tis_relinquish_locality(struct tpm_tis_data *priv, int l) +{ + tpm_tis_write8(priv, TPM_ACCESS(l), TPM_ACCESS_ACTIVE_LOCALITY); + + return 0; +} + +static int tpm_tis_relinquish_locality(struct tpm_chip *chip, int l) { struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); - tpm_tis_write8(priv, TPM_ACCESS(l), TPM_ACCESS_ACTIVE_LOCALITY); + mutex_lock(&priv->locality_count_mutex); + priv->locality_count--; + if (priv->locality_count == 0) + __tpm_tis_relinquish_locality(priv, l); + mutex_unlock(&priv->locality_count_mutex); return 0; } -static int request_locality(struct tpm_chip *chip, int l) +static int __tpm_tis_request_locality(struct tpm_chip *chip, int l) { struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); unsigned long stop, timeout; @@ -186,6 +226,20 @@ again: return -1; } +static int tpm_tis_request_locality(struct tpm_chip *chip, int l) +{ + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); + int ret = 0; + + mutex_lock(&priv->locality_count_mutex); + if (priv->locality_count == 0) + ret = __tpm_tis_request_locality(chip, l); + if (!ret) + priv->locality_count++; + mutex_unlock(&priv->locality_count_mutex); + return ret; +} + static u8 tpm_tis_status(struct tpm_chip *chip) { struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); @@ -351,7 +405,7 @@ static int tpm_tis_send_data(struct tpm_chip *chip, const u8 *buf, size_t len) struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); int rc, status, burstcnt; size_t count = 0; - bool itpm = priv->flags & TPM_TIS_ITPM_WORKAROUND; + bool itpm = test_bit(TPM_TIS_ITPM_WORKAROUND, &priv->flags); status = tpm_tis_status(chip); if ((status & TPM_STS_COMMAND_READY) == 0) { @@ -484,7 +538,8 @@ static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len) int rc, irq; struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); - if (!(chip->flags & TPM_CHIP_FLAG_IRQ) || priv->irq_tested) + if (!(chip->flags & TPM_CHIP_FLAG_IRQ) || + test_bit(TPM_TIS_IRQ_TESTED, &priv->flags)) return tpm_tis_send_main(chip, buf, len); /* Verify receipt of the expected IRQ */ @@ -494,11 +549,11 @@ static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len) rc = tpm_tis_send_main(chip, buf, len); priv->irq = irq; chip->flags |= TPM_CHIP_FLAG_IRQ; - if (!priv->irq_tested) + if (!test_bit(TPM_TIS_IRQ_TESTED, &priv->flags)) tpm_msleep(1); - if (!priv->irq_tested) + if (!test_bit(TPM_TIS_IRQ_TESTED, &priv->flags)) disable_interrupts(chip); - priv->irq_tested = true; + set_bit(TPM_TIS_IRQ_TESTED, &priv->flags); return rc; } @@ -641,7 +696,7 @@ static int probe_itpm(struct tpm_chip *chip) size_t len = sizeof(cmd_getticks); u16 vendor; - if (priv->flags & TPM_TIS_ITPM_WORKAROUND) + if (test_bit(TPM_TIS_ITPM_WORKAROUND, &priv->flags)) return 0; rc = tpm_tis_read16(priv, TPM_DID_VID(0), &vendor); @@ -652,7 +707,7 @@ static int probe_itpm(struct tpm_chip *chip) if (vendor != TPM_VID_INTEL) return 0; - if (request_locality(chip, 0) != 0) + if (tpm_tis_request_locality(chip, 0) != 0) return -EBUSY; rc = tpm_tis_send_data(chip, cmd_getticks, len); @@ -661,19 +716,19 @@ static int probe_itpm(struct tpm_chip *chip) tpm_tis_ready(chip); - priv->flags |= TPM_TIS_ITPM_WORKAROUND; + set_bit(TPM_TIS_ITPM_WORKAROUND, &priv->flags); rc = tpm_tis_send_data(chip, cmd_getticks, len); if (rc == 0) dev_info(&chip->dev, "Detected an iTPM.\n"); else { - priv->flags &= ~TPM_TIS_ITPM_WORKAROUND; + clear_bit(TPM_TIS_ITPM_WORKAROUND, &priv->flags); rc = -EFAULT; } out: tpm_tis_ready(chip); - release_locality(chip, priv->locality); + tpm_tis_relinquish_locality(chip, priv->locality); return rc; } @@ -702,7 +757,7 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id) struct tpm_chip *chip = dev_id; struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); u32 interrupt; - int i, rc; + int rc; rc = tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &interrupt); if (rc < 0) @@ -711,20 +766,19 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id) if (interrupt == 0) return IRQ_NONE; - priv->irq_tested = true; + set_bit(TPM_TIS_IRQ_TESTED, &priv->flags); if (interrupt & TPM_INTF_DATA_AVAIL_INT) wake_up_interruptible(&priv->read_queue); - if (interrupt & TPM_INTF_LOCALITY_CHANGE_INT) - for (i = 0; i < 5; i++) - if (check_locality(chip, i)) - break; + if (interrupt & (TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_STS_VALID_INT | TPM_INTF_CMD_READY_INT)) wake_up_interruptible(&priv->int_queue); /* Clear interrupts handled with TPM_EOI */ + tpm_tis_request_locality(chip, 0); rc = tpm_tis_write32(priv, TPM_INT_STATUS(priv->locality), interrupt); + tpm_tis_relinquish_locality(chip, 0); if (rc < 0) return IRQ_NONE; @@ -732,25 +786,22 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id) return IRQ_HANDLED; } -static int tpm_tis_gen_interrupt(struct tpm_chip *chip) +static void tpm_tis_gen_interrupt(struct tpm_chip *chip) { const char *desc = "attempting to generate an interrupt"; u32 cap2; cap_t cap; int ret; - ret = request_locality(chip, 0); - if (ret < 0) - return ret; + chip->flags |= TPM_CHIP_FLAG_IRQ; if (chip->flags & TPM_CHIP_FLAG_TPM2) ret = tpm2_get_tpm_pt(chip, 0x100, &cap2, desc); else ret = tpm1_getcap(chip, TPM_CAP_PROP_TIS_TIMEOUT, &cap, desc, 0); - release_locality(chip, 0); - - return ret; + if (ret) + chip->flags &= ~TPM_CHIP_FLAG_IRQ; } /* Register the IRQ and issue a command that will cause an interrupt. If an @@ -765,60 +816,66 @@ static int tpm_tis_probe_irq_single(struct tpm_chip *chip, u32 intmask, int rc; u32 int_status; - if (devm_request_irq(chip->dev.parent, irq, tis_int_handler, flags, - dev_name(&chip->dev), chip) != 0) { + + rc = devm_request_threaded_irq(chip->dev.parent, irq, NULL, + tis_int_handler, IRQF_ONESHOT | flags, + dev_name(&chip->dev), chip); + if (rc) { dev_info(&chip->dev, "Unable to request irq: %d for probe\n", irq); return -1; } priv->irq = irq; + rc = tpm_tis_request_locality(chip, 0); + if (rc < 0) + return rc; + rc = tpm_tis_read8(priv, TPM_INT_VECTOR(priv->locality), &original_int_vec); - if (rc < 0) + if (rc < 0) { + tpm_tis_relinquish_locality(chip, priv->locality); return rc; + } rc = tpm_tis_write8(priv, TPM_INT_VECTOR(priv->locality), irq); if (rc < 0) - return rc; + goto restore_irqs; rc = tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &int_status); if (rc < 0) - return rc; + goto restore_irqs; /* Clear all existing */ rc = tpm_tis_write32(priv, TPM_INT_STATUS(priv->locality), int_status); if (rc < 0) - return rc; - + goto restore_irqs; /* Turn on */ rc = tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask | TPM_GLOBAL_INT_ENABLE); if (rc < 0) - return rc; + goto restore_irqs; - priv->irq_tested = false; + clear_bit(TPM_TIS_IRQ_TESTED, &priv->flags); /* Generate an interrupt by having the core call through to * tpm_tis_send */ - rc = tpm_tis_gen_interrupt(chip); - if (rc < 0) - return rc; + tpm_tis_gen_interrupt(chip); +restore_irqs: /* tpm_tis_send will either confirm the interrupt is working or it * will call disable_irq which undoes all of the above. */ if (!(chip->flags & TPM_CHIP_FLAG_IRQ)) { - rc = tpm_tis_write8(priv, original_int_vec, - TPM_INT_VECTOR(priv->locality)); - if (rc < 0) - return rc; - - return 1; + tpm_tis_write8(priv, original_int_vec, + TPM_INT_VECTOR(priv->locality)); + rc = -1; } - return 0; + tpm_tis_relinquish_locality(chip, priv->locality); + + return rc; } /* Try to find the IRQ the TPM is using. This is for legacy x86 systems that @@ -932,8 +989,8 @@ static const struct tpm_class_ops tpm_tis = { .req_complete_mask = TPM_STS_DATA_AVAIL | TPM_STS_VALID, .req_complete_val = TPM_STS_DATA_AVAIL | TPM_STS_VALID, .req_canceled = tpm_tis_req_canceled, - .request_locality = request_locality, - .relinquish_locality = release_locality, + .request_locality = tpm_tis_request_locality, + .relinquish_locality = tpm_tis_relinquish_locality, .clk_enable = tpm_tis_clkrun_enable, }; @@ -967,6 +1024,8 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, priv->timeout_min = TPM_TIMEOUT_USECS_MIN; priv->timeout_max = TPM_TIMEOUT_USECS_MAX; priv->phy_ops = phy_ops; + priv->locality_count = 0; + mutex_init(&priv->locality_count_mutex); dev_set_drvdata(&chip->dev, priv); @@ -1009,18 +1068,50 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, if (rc < 0) goto out_err; - intmask |= TPM_INTF_CMD_READY_INT | TPM_INTF_LOCALITY_CHANGE_INT | - TPM_INTF_DATA_AVAIL_INT | TPM_INTF_STS_VALID_INT; + /* Figure out the capabilities */ + rc = tpm_tis_read32(priv, TPM_INTF_CAPS(priv->locality), &intfcaps); + if (rc < 0) + goto out_err; + + dev_dbg(dev, "TPM interface capabilities (0x%x):\n", + intfcaps); + if (intfcaps & TPM_INTF_BURST_COUNT_STATIC) + dev_dbg(dev, "\tBurst Count Static\n"); + if (intfcaps & TPM_INTF_CMD_READY_INT) { + intmask |= TPM_INTF_CMD_READY_INT; + dev_dbg(dev, "\tCommand Ready Int Support\n"); + } + if (intfcaps & TPM_INTF_INT_EDGE_FALLING) + dev_dbg(dev, "\tInterrupt Edge Falling\n"); + if (intfcaps & TPM_INTF_INT_EDGE_RISING) + dev_dbg(dev, "\tInterrupt Edge Rising\n"); + if (intfcaps & TPM_INTF_INT_LEVEL_LOW) + dev_dbg(dev, "\tInterrupt Level Low\n"); + if (intfcaps & TPM_INTF_INT_LEVEL_HIGH) + dev_dbg(dev, "\tInterrupt Level High\n"); + if (intfcaps & TPM_INTF_LOCALITY_CHANGE_INT) { + intmask |= TPM_INTF_LOCALITY_CHANGE_INT; + dev_dbg(dev, "\tLocality Change Int Support\n"); + } + if (intfcaps & TPM_INTF_STS_VALID_INT) { + intmask |= TPM_INTF_STS_VALID_INT; + dev_dbg(dev, "\tSts Valid Int Support\n"); + } + if (intfcaps & TPM_INTF_DATA_AVAIL_INT) { + intmask |= TPM_INTF_DATA_AVAIL_INT; + dev_dbg(dev, "\tData Avail Int Support\n"); + } + intmask &= ~TPM_GLOBAL_INT_ENABLE; - rc = request_locality(chip, 0); + rc = tpm_tis_request_locality(chip, 0); if (rc < 0) { rc = -ENODEV; goto out_err; } tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask); - release_locality(chip, 0); + tpm_tis_relinquish_locality(chip, 0); rc = tpm_chip_start(chip); if (rc) @@ -1044,35 +1135,14 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, goto out_err; } - /* Figure out the capabilities */ - rc = tpm_tis_read32(priv, TPM_INTF_CAPS(priv->locality), &intfcaps); - if (rc < 0) - goto out_err; - - dev_dbg(dev, "TPM interface capabilities (0x%x):\n", - intfcaps); - if (intfcaps & TPM_INTF_BURST_COUNT_STATIC) - dev_dbg(dev, "\tBurst Count Static\n"); - if (intfcaps & TPM_INTF_CMD_READY_INT) - dev_dbg(dev, "\tCommand Ready Int Support\n"); - if (intfcaps & TPM_INTF_INT_EDGE_FALLING) - dev_dbg(dev, "\tInterrupt Edge Falling\n"); - if (intfcaps & TPM_INTF_INT_EDGE_RISING) - dev_dbg(dev, "\tInterrupt Edge Rising\n"); - if (intfcaps & TPM_INTF_INT_LEVEL_LOW) - dev_dbg(dev, "\tInterrupt Level Low\n"); - if (intfcaps & TPM_INTF_INT_LEVEL_HIGH) - dev_dbg(dev, "\tInterrupt Level High\n"); - if (intfcaps & TPM_INTF_LOCALITY_CHANGE_INT) - dev_dbg(dev, "\tLocality Change Int Support\n"); - if (intfcaps & TPM_INTF_STS_VALID_INT) - dev_dbg(dev, "\tSts Valid Int Support\n"); - if (intfcaps & TPM_INTF_DATA_AVAIL_INT) - dev_dbg(dev, "\tData Avail Int Support\n"); - /* INTERRUPT Setup */ init_waitqueue_head(&priv->read_queue); init_waitqueue_head(&priv->int_queue); + + rc = tpm_chip_startup(chip); + if (rc) + goto out_err; + if (irq != -1) { /* * Before doing irq testing issue a command to the TPM in polling mode @@ -1080,13 +1150,13 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, * proper timeouts for the driver. */ - rc = request_locality(chip, 0); + rc = tpm_tis_request_locality(chip, 0); if (rc < 0) goto out_err; rc = tpm_get_timeouts(chip); - release_locality(chip, 0); + tpm_tis_relinquish_locality(chip, 0); if (rc) { dev_err(dev, "Could not get TPM timeouts and durations\n"); @@ -1094,17 +1164,23 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, goto out_err; } - if (irq) { + if (irq) tpm_tis_probe_irq_single(chip, intmask, IRQF_SHARED, irq); - if (!(chip->flags & TPM_CHIP_FLAG_IRQ)) { - dev_err(&chip->dev, FW_BUG - "TPM interrupt not working, polling instead\n"); + else + tpm_tis_probe_irq(chip, intmask); - disable_interrupts(chip); - } + if (chip->flags & TPM_CHIP_FLAG_IRQ) { + priv->int_mask = intmask; } else { - tpm_tis_probe_irq(chip, intmask); + dev_err(&chip->dev, FW_BUG + "TPM interrupt not working, polling instead\n"); + + rc = tpm_tis_request_locality(chip, 0); + if (rc < 0) + goto out_err; + disable_interrupts(chip); + tpm_tis_relinquish_locality(chip, 0); } } @@ -1143,13 +1219,7 @@ static void tpm_tis_reenable_interrupts(struct tpm_chip *chip) if (rc < 0) goto out; - rc = tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask); - if (rc < 0) - goto out; - - intmask |= TPM_INTF_CMD_READY_INT - | TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_DATA_AVAIL_INT - | TPM_INTF_STS_VALID_INT | TPM_GLOBAL_INT_ENABLE; + intmask = priv->int_mask | TPM_GLOBAL_INT_ENABLE; tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask); @@ -1165,28 +1235,27 @@ int tpm_tis_resume(struct device *dev) struct tpm_chip *chip = dev_get_drvdata(dev); int ret; + ret = tpm_tis_request_locality(chip, 0); + if (ret < 0) + return ret; + if (chip->flags & TPM_CHIP_FLAG_IRQ) tpm_tis_reenable_interrupts(chip); ret = tpm_pm_resume(dev); if (ret) - return ret; + goto out; /* * TPM 1.2 requires self-test on resume. This function actually returns * an error code but for unknown reason it isn't handled. */ - if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) { - ret = request_locality(chip, 0); - if (ret < 0) - return ret; - + if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) tpm1_do_selftest(chip); +out: + tpm_tis_relinquish_locality(chip, 0); - release_locality(chip, 0); - } - - return 0; + return ret; } EXPORT_SYMBOL_GPL(tpm_tis_resume); #endif diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index b68479e0de10..e978f457fd4d 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -87,13 +87,16 @@ enum tpm_tis_flags { TPM_TIS_ITPM_WORKAROUND = BIT(0), TPM_TIS_INVALID_STATUS = BIT(1), TPM_TIS_DEFAULT_CANCELLATION = BIT(2), + TPM_TIS_IRQ_TESTED = BIT(3), }; struct tpm_tis_data { u16 manufacturer_id; + struct mutex locality_count_mutex; + unsigned int locality_count; int locality; int irq; - bool irq_tested; + unsigned int int_mask; unsigned long flags; void __iomem *ilb_base_addr; u16 clkrun_enabled; diff --git a/drivers/char/tpm/tpm_tis_i2c_cr50.c b/drivers/char/tpm/tpm_tis_i2c_cr50.c index 77cea5b31c6e..376ae18a04eb 100644 --- a/drivers/char/tpm/tpm_tis_i2c_cr50.c +++ b/drivers/char/tpm/tpm_tis_i2c_cr50.c @@ -100,8 +100,7 @@ static int tpm_cr50_i2c_wait_tpm_ready(struct tpm_chip *chip) } /* Wait for interrupt to indicate TPM is ready to respond */ - if (!wait_for_completion_timeout(&priv->tpm_ready, - msecs_to_jiffies(chip->timeout_a))) { + if (!wait_for_completion_timeout(&priv->tpm_ready, chip->timeout_a)) { dev_warn(&chip->dev, "Timeout waiting for TPM ready\n"); return -ETIMEDOUT; } diff --git a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c index a0963a3e92bd..1f5207974a17 100644 --- a/drivers/char/tpm/tpm_tis_spi_main.c +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -231,7 +231,7 @@ static const struct spi_device_id tpm_tis_spi_id[] = { }; MODULE_DEVICE_TABLE(spi, tpm_tis_spi_id); -static const struct of_device_id of_tis_spi_match[] = { +static const struct of_device_id of_tis_spi_match[] __maybe_unused = { { .compatible = "st,st33htpm-spi", .data = tpm_tis_spi_probe }, { .compatible = "infineon,slb9670", .data = tpm_tis_spi_probe }, { .compatible = "tcg,tpm_tis-spi", .data = tpm_tis_spi_probe }, @@ -240,7 +240,7 @@ static const struct of_device_id of_tis_spi_match[] = { }; MODULE_DEVICE_TABLE(of, of_tis_spi_match); -static const struct acpi_device_id acpi_tis_spi_match[] = { +static const struct acpi_device_id acpi_tis_spi_match[] __maybe_unused = { {"SMO0768", 0}, {} }; diff --git a/drivers/char/tpm/tpm_tis_synquacer.c b/drivers/char/tpm/tpm_tis_synquacer.c index 679196c61401..49278746b0e2 100644 --- a/drivers/char/tpm/tpm_tis_synquacer.c +++ b/drivers/char/tpm/tpm_tis_synquacer.c @@ -127,14 +127,12 @@ static int tpm_tis_synquacer_probe(struct platform_device *pdev) return tpm_tis_synquacer_init(&pdev->dev, &tpm_info); } -static int tpm_tis_synquacer_remove(struct platform_device *pdev) +static void tpm_tis_synquacer_remove(struct platform_device *pdev) { struct tpm_chip *chip = dev_get_drvdata(&pdev->dev); tpm_chip_unregister(chip); tpm_tis_remove(chip); - - return 0; } #ifdef CONFIG_OF @@ -155,7 +153,7 @@ MODULE_DEVICE_TABLE(acpi, tpm_synquacer_acpi_tbl); static struct platform_driver tis_synquacer_drv = { .probe = tpm_tis_synquacer_probe, - .remove = tpm_tis_synquacer_remove, + .remove_new = tpm_tis_synquacer_remove, .driver = { .name = "tpm_tis_synquacer", .pm = &tpm_tis_synquacer_pm, diff --git a/drivers/clk/clk-renesas-pcie.c b/drivers/clk/clk-renesas-pcie.c index f91f30560820..ff3a52d48479 100644 --- a/drivers/clk/clk-renesas-pcie.c +++ b/drivers/clk/clk-renesas-pcie.c @@ -143,8 +143,9 @@ static int rs9_regmap_i2c_read(void *context, static const struct regmap_config rs9_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_NONE, + .cache_type = REGCACHE_FLAT, .max_register = RS9_REG_BCP, + .num_reg_defaults_raw = 0x8, .rd_table = &rs9_readable_table, .wr_table = &rs9_writeable_table, .reg_write = rs9_regmap_i2c_write, diff --git a/drivers/clk/imx/clk-imx6ul.c b/drivers/clk/imx/clk-imx6ul.c index 2836adb817b7..e3696a88b5a3 100644 --- a/drivers/clk/imx/clk-imx6ul.c +++ b/drivers/clk/imx/clk-imx6ul.c @@ -95,14 +95,16 @@ static const struct clk_div_table video_div_table[] = { { } }; -static const char * enet1_ref_sels[] = { "enet1_ref_125m", "enet1_ref_pad", }; +static const char * enet1_ref_sels[] = { "enet1_ref_125m", "enet1_ref_pad", "dummy", "dummy"}; static const u32 enet1_ref_sels_table[] = { IMX6UL_GPR1_ENET1_TX_CLK_DIR, - IMX6UL_GPR1_ENET1_CLK_SEL }; + IMX6UL_GPR1_ENET1_CLK_SEL, 0, + IMX6UL_GPR1_ENET1_TX_CLK_DIR | IMX6UL_GPR1_ENET1_CLK_SEL }; static const u32 enet1_ref_sels_table_mask = IMX6UL_GPR1_ENET1_TX_CLK_DIR | IMX6UL_GPR1_ENET1_CLK_SEL; -static const char * enet2_ref_sels[] = { "enet2_ref_125m", "enet2_ref_pad", }; +static const char * enet2_ref_sels[] = { "enet2_ref_125m", "enet2_ref_pad", "dummy", "dummy"}; static const u32 enet2_ref_sels_table[] = { IMX6UL_GPR1_ENET2_TX_CLK_DIR, - IMX6UL_GPR1_ENET2_CLK_SEL }; + IMX6UL_GPR1_ENET2_CLK_SEL, 0, + IMX6UL_GPR1_ENET2_TX_CLK_DIR | IMX6UL_GPR1_ENET2_CLK_SEL }; static const u32 enet2_ref_sels_table_mask = IMX6UL_GPR1_ENET2_TX_CLK_DIR | IMX6UL_GPR1_ENET2_CLK_SEL; diff --git a/drivers/clk/sprd/common.c b/drivers/clk/sprd/common.c index ce81e4087a8f..2bfbab8db94b 100644 --- a/drivers/clk/sprd/common.c +++ b/drivers/clk/sprd/common.c @@ -17,7 +17,6 @@ static const struct regmap_config sprdclk_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .max_register = 0xffff, .fast_io = true, }; @@ -43,6 +42,8 @@ int sprd_clk_regmap_init(struct platform_device *pdev, struct device *dev = &pdev->dev; struct device_node *node = dev->of_node, *np; struct regmap *regmap; + struct resource *res; + struct regmap_config reg_config = sprdclk_regmap_config; if (of_find_property(node, "sprd,syscon", NULL)) { regmap = syscon_regmap_lookup_by_phandle(node, "sprd,syscon"); @@ -59,12 +60,14 @@ int sprd_clk_regmap_init(struct platform_device *pdev, return PTR_ERR(regmap); } } else { - base = devm_platform_ioremap_resource(pdev, 0); + base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(base)) return PTR_ERR(base); + reg_config.max_register = resource_size(res) - reg_config.reg_stride; + regmap = devm_regmap_init_mmio(&pdev->dev, base, - &sprdclk_regmap_config); + ®_config); if (IS_ERR(regmap)) { pr_err("failed to init regmap\n"); return PTR_ERR(regmap); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 73c7643b2697..8dd46fad151e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -840,22 +840,20 @@ static int amd_pstate_update_status(const char *buf, size_t size) switch(mode_idx) { case AMD_PSTATE_DISABLE: - if (!current_pstate_driver) - return -EINVAL; - if (cppc_state == AMD_PSTATE_ACTIVE) - return -EBUSY; - cpufreq_unregister_driver(current_pstate_driver); - amd_pstate_driver_cleanup(); + if (current_pstate_driver) { + cpufreq_unregister_driver(current_pstate_driver); + amd_pstate_driver_cleanup(); + } break; case AMD_PSTATE_PASSIVE: if (current_pstate_driver) { if (current_pstate_driver == &amd_pstate_driver) return 0; cpufreq_unregister_driver(current_pstate_driver); - cppc_state = AMD_PSTATE_PASSIVE; - current_pstate_driver = &amd_pstate_driver; } + current_pstate_driver = &amd_pstate_driver; + cppc_state = AMD_PSTATE_PASSIVE; ret = cpufreq_register_driver(current_pstate_driver); break; case AMD_PSTATE_ACTIVE: @@ -863,10 +861,10 @@ static int amd_pstate_update_status(const char *buf, size_t size) if (current_pstate_driver == &amd_pstate_epp_driver) return 0; cpufreq_unregister_driver(current_pstate_driver); - current_pstate_driver = &amd_pstate_epp_driver; - cppc_state = AMD_PSTATE_ACTIVE; } + current_pstate_driver = &amd_pstate_epp_driver; + cppc_state = AMD_PSTATE_ACTIVE; ret = cpufreq_register_driver(current_pstate_driver); break; default: diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index e2f25926eb51..e346c00b132a 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -442,12 +442,19 @@ static int __sev_init_ex_locked(int *error) return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error); } +static inline int __sev_do_init_locked(int *psp_ret) +{ + if (sev_init_ex_buffer) + return __sev_init_ex_locked(psp_ret); + else + return __sev_init_locked(psp_ret); +} + static int __sev_platform_init_locked(int *error) { + int rc = 0, psp_ret = SEV_RET_NO_FW_CALL; struct psp_device *psp = psp_master; struct sev_device *sev; - int rc = 0, psp_ret = -1; - int (*init_function)(int *error); if (!psp || !psp->sev_data) return -ENODEV; @@ -458,15 +465,12 @@ static int __sev_platform_init_locked(int *error) return 0; if (sev_init_ex_buffer) { - init_function = __sev_init_ex_locked; rc = sev_read_init_ex_file(); if (rc) return rc; - } else { - init_function = __sev_init_locked; } - rc = init_function(&psp_ret); + rc = __sev_do_init_locked(&psp_ret); if (rc && psp_ret == SEV_RET_SECURE_DATA_INVALID) { /* * Initialization command returned an integrity check failure @@ -475,9 +479,11 @@ static int __sev_platform_init_locked(int *error) * initialization function should succeed by replacing the state * with a reset state. */ - dev_err(sev->dev, "SEV: retrying INIT command because of SECURE_DATA_INVALID error. Retrying once to reset PSP SEV state."); - rc = init_function(&psp_ret); + dev_err(sev->dev, +"SEV: retrying INIT command because of SECURE_DATA_INVALID error. Retrying once to reset PSP SEV state."); + rc = __sev_do_init_locked(&psp_ret); } + if (error) *error = psp_ret; diff --git a/drivers/dma/apple-admac.c b/drivers/dma/apple-admac.c index 90f28bda29c8..4cf8da77bdd9 100644 --- a/drivers/dma/apple-admac.c +++ b/drivers/dma/apple-admac.c @@ -75,6 +75,7 @@ #define REG_TX_INTSTATE(idx) (0x0030 + (idx) * 4) #define REG_RX_INTSTATE(idx) (0x0040 + (idx) * 4) +#define REG_GLOBAL_INTSTATE(idx) (0x0050 + (idx) * 4) #define REG_CHAN_INTSTATUS(ch, idx) (0x8010 + (ch) * 0x200 + (idx) * 4) #define REG_CHAN_INTMASK(ch, idx) (0x8020 + (ch) * 0x200 + (idx) * 4) @@ -511,7 +512,10 @@ static int admac_terminate_all(struct dma_chan *chan) admac_stop_chan(adchan); admac_reset_rings(adchan); - adchan->current_tx = NULL; + if (adchan->current_tx) { + list_add_tail(&adchan->current_tx->node, &adchan->to_free); + adchan->current_tx = NULL; + } /* * Descriptors can only be freed after the tasklet * has been killed (in admac_synchronize). @@ -672,13 +676,14 @@ static void admac_handle_chan_int(struct admac_data *ad, int no) static irqreturn_t admac_interrupt(int irq, void *devid) { struct admac_data *ad = devid; - u32 rx_intstate, tx_intstate; + u32 rx_intstate, tx_intstate, global_intstate; int i; rx_intstate = readl_relaxed(ad->base + REG_RX_INTSTATE(ad->irq_index)); tx_intstate = readl_relaxed(ad->base + REG_TX_INTSTATE(ad->irq_index)); + global_intstate = readl_relaxed(ad->base + REG_GLOBAL_INTSTATE(ad->irq_index)); - if (!tx_intstate && !rx_intstate) + if (!tx_intstate && !rx_intstate && !global_intstate) return IRQ_NONE; for (i = 0; i < ad->nchannels; i += 2) { @@ -693,6 +698,12 @@ static irqreturn_t admac_interrupt(int irq, void *devid) rx_intstate >>= 1; } + if (global_intstate) { + dev_warn(ad->dev, "clearing unknown global interrupt flag: %x\n", + global_intstate); + writel_relaxed(~(u32) 0, ad->base + REG_GLOBAL_INTSTATE(ad->irq_index)); + } + return IRQ_HANDLED; } @@ -850,6 +861,9 @@ static int admac_probe(struct platform_device *pdev) dma->directions = BIT(DMA_MEM_TO_DEV) | BIT(DMA_DEV_TO_MEM); dma->residue_granularity = DMA_RESIDUE_GRANULARITY_BURST; + dma->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | + BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | + BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); dma->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index c24bca210104..826b98284fa1 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1342,7 +1342,7 @@ int dmaenginem_async_device_register(struct dma_device *device) if (ret) return ret; - return devm_add_action(device->dev, dmaenginem_async_device_unregister, device); + return devm_add_action_or_reset(device->dev, dmaenginem_async_device_unregister, device); } EXPORT_SYMBOL(dmaenginem_async_device_register); diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 462109c61653..93ee298d52b8 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -277,7 +277,7 @@ failed: /** * xdma_xfer_start - Start DMA transfer - * @xdma_chan: DMA channel pointer + * @xchan: DMA channel pointer */ static int xdma_xfer_start(struct xdma_chan *xchan) { diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index e7e8e624a436..8b31cd54bdb6 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -2149,10 +2149,8 @@ static int altr_edac_a10_probe(struct platform_device *pdev) } edac->sb_irq = platform_get_irq(pdev, 0); - if (edac->sb_irq < 0) { - dev_err(&pdev->dev, "No SBERR IRQ resource\n"); + if (edac->sb_irq < 0) return edac->sb_irq; - } irq_set_chained_handler_and_data(edac->sb_irq, altr_edac_a10_irq_handler, @@ -2184,10 +2182,9 @@ static int altr_edac_a10_probe(struct platform_device *pdev) } #else edac->db_irq = platform_get_irq(pdev, 1); - if (edac->db_irq < 0) { - dev_err(&pdev->dev, "No DBERR IRQ resource\n"); + if (edac->db_irq < 0) return edac->db_irq; - } + irq_set_chained_handler_and_data(edac->db_irq, altr_edac_a10_irq_handler, edac); #endif @@ -2226,6 +2223,5 @@ static struct platform_driver altr_edac_a10_driver = { }; module_platform_driver(altr_edac_a10_driver); -MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Thor Thayer"); MODULE_DESCRIPTION("EDAC Driver for Altera Memories"); diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5b42533f306a..5c4292e65b96 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -13,11 +13,9 @@ module_param(ecc_enable_override, int, 0644); static struct msr __percpu *msrs; -static struct amd64_family_type *fam_type; - -static inline u32 get_umc_reg(u32 reg) +static inline u32 get_umc_reg(struct amd64_pvt *pvt, u32 reg) { - if (!fam_type->flags.zn_regs_v2) + if (!pvt->flags.zn_regs_v2) return reg; switch (reg) { @@ -437,7 +435,7 @@ static void get_cs_base_and_mask(struct amd64_pvt *pvt, int csrow, u8 dct, for (i = 0; i < pvt->csels[dct].m_cnt; i++) #define for_each_umc(i) \ - for (i = 0; i < fam_type->max_mcs; i++) + for (i = 0; i < pvt->max_mcs; i++) /* * @input_addr is an InputAddr associated with the node given by mci. Return the @@ -1258,40 +1256,102 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16); * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs * are ECC capable. */ -static unsigned long determine_edac_cap(struct amd64_pvt *pvt) +static unsigned long dct_determine_edac_cap(struct amd64_pvt *pvt) { unsigned long edac_cap = EDAC_FLAG_NONE; u8 bit; - if (pvt->umc) { - u8 i, umc_en_mask = 0, dimm_ecc_en_mask = 0; + bit = (pvt->fam > 0xf || pvt->ext_model >= K8_REV_F) + ? 19 + : 17; - for_each_umc(i) { - if (!(pvt->umc[i].sdp_ctrl & UMC_SDP_INIT)) - continue; + if (pvt->dclr0 & BIT(bit)) + edac_cap = EDAC_FLAG_SECDED; - umc_en_mask |= BIT(i); + return edac_cap; +} - /* UMC Configuration bit 12 (DimmEccEn) */ - if (pvt->umc[i].umc_cfg & BIT(12)) - dimm_ecc_en_mask |= BIT(i); - } +static unsigned long umc_determine_edac_cap(struct amd64_pvt *pvt) +{ + u8 i, umc_en_mask = 0, dimm_ecc_en_mask = 0; + unsigned long edac_cap = EDAC_FLAG_NONE; - if (umc_en_mask == dimm_ecc_en_mask) - edac_cap = EDAC_FLAG_SECDED; - } else { - bit = (pvt->fam > 0xf || pvt->ext_model >= K8_REV_F) - ? 19 - : 17; + for_each_umc(i) { + if (!(pvt->umc[i].sdp_ctrl & UMC_SDP_INIT)) + continue; - if (pvt->dclr0 & BIT(bit)) - edac_cap = EDAC_FLAG_SECDED; + umc_en_mask |= BIT(i); + + /* UMC Configuration bit 12 (DimmEccEn) */ + if (pvt->umc[i].umc_cfg & BIT(12)) + dimm_ecc_en_mask |= BIT(i); } + if (umc_en_mask == dimm_ecc_en_mask) + edac_cap = EDAC_FLAG_SECDED; + return edac_cap; } -static void debug_display_dimm_sizes(struct amd64_pvt *, u8); +/* + * debug routine to display the memory sizes of all logical DIMMs and its + * CSROWs + */ +static void dct_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) +{ + u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; + u32 dbam = ctrl ? pvt->dbam1 : pvt->dbam0; + int dimm, size0, size1; + + if (pvt->fam == 0xf) { + /* K8 families < revF not supported yet */ + if (pvt->ext_model < K8_REV_F) + return; + + WARN_ON(ctrl != 0); + } + + if (pvt->fam == 0x10) { + dbam = (ctrl && !dct_ganging_enabled(pvt)) ? pvt->dbam1 + : pvt->dbam0; + dcsb = (ctrl && !dct_ganging_enabled(pvt)) ? + pvt->csels[1].csbases : + pvt->csels[0].csbases; + } else if (ctrl) { + dbam = pvt->dbam0; + dcsb = pvt->csels[1].csbases; + } + edac_dbg(1, "F2x%d80 (DRAM Bank Address Mapping): 0x%08x\n", + ctrl, dbam); + + edac_printk(KERN_DEBUG, EDAC_MC, "DCT%d chip selects:\n", ctrl); + + /* Dump memory sizes for DIMM and its CSROWs */ + for (dimm = 0; dimm < 4; dimm++) { + size0 = 0; + if (dcsb[dimm * 2] & DCSB_CS_ENABLE) + /* + * For F15m60h, we need multiplier for LRDIMM cs_size + * calculation. We pass dimm value to the dbam_to_cs + * mapper so we can find the multiplier from the + * corresponding DCSM. + */ + size0 = pvt->ops->dbam_to_cs(pvt, ctrl, + DBAM_DIMM(dimm, dbam), + dimm); + + size1 = 0; + if (dcsb[dimm * 2 + 1] & DCSB_CS_ENABLE) + size1 = pvt->ops->dbam_to_cs(pvt, ctrl, + DBAM_DIMM(dimm, dbam), + dimm); + + amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", + dimm * 2, size0, + dimm * 2 + 1, size1); + } +} + static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) { @@ -1334,7 +1394,7 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) #define CS_EVEN (CS_EVEN_PRIMARY | CS_EVEN_SECONDARY) #define CS_ODD (CS_ODD_PRIMARY | CS_ODD_SECONDARY) -static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) +static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) { u8 base, count = 0; int cs_mode = 0; @@ -1366,7 +1426,85 @@ static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) return cs_mode; } -static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) +static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, + unsigned int cs_mode, int csrow_nr) +{ + u32 addr_mask_orig, addr_mask_deinterleaved; + u32 msb, weight, num_zero_bits; + int cs_mask_nr = csrow_nr; + int dimm, size = 0; + + /* No Chip Selects are enabled. */ + if (!cs_mode) + return size; + + /* Requested size of an even CS but none are enabled. */ + if (!(cs_mode & CS_EVEN) && !(csrow_nr & 1)) + return size; + + /* Requested size of an odd CS but none are enabled. */ + if (!(cs_mode & CS_ODD) && (csrow_nr & 1)) + return size; + + /* + * Family 17h introduced systems with one mask per DIMM, + * and two Chip Selects per DIMM. + * + * CS0 and CS1 -> MASK0 / DIMM0 + * CS2 and CS3 -> MASK1 / DIMM1 + * + * Family 19h Model 10h introduced systems with one mask per Chip Select, + * and two Chip Selects per DIMM. + * + * CS0 -> MASK0 -> DIMM0 + * CS1 -> MASK1 -> DIMM0 + * CS2 -> MASK2 -> DIMM1 + * CS3 -> MASK3 -> DIMM1 + * + * Keep the mask number equal to the Chip Select number for newer systems, + * and shift the mask number for older systems. + */ + dimm = csrow_nr >> 1; + + if (!pvt->flags.zn_regs_v2) + cs_mask_nr >>= 1; + + /* Asymmetric dual-rank DIMM support. */ + if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY)) + addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr]; + else + addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr]; + + /* + * The number of zero bits in the mask is equal to the number of bits + * in a full mask minus the number of bits in the current mask. + * + * The MSB is the number of bits in the full mask because BIT[0] is + * always 0. + * + * In the special 3 Rank interleaving case, a single bit is flipped + * without swapping with the most significant bit. This can be handled + * by keeping the MSB where it is and ignoring the single zero bit. + */ + msb = fls(addr_mask_orig) - 1; + weight = hweight_long(addr_mask_orig); + num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE); + + /* Take the number of zero bits off from the top of the mask. */ + addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); + + edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm); + edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig); + edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved); + + /* Register [31:1] = Address [39:9]. Size is in kBs here. */ + size = (addr_mask_deinterleaved >> 2) + 1; + + /* Return size in MBs. */ + return size >> 10; +} + +static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) { int dimm, size0, size1, cs0, cs1, cs_mode; @@ -1376,10 +1514,10 @@ static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) cs0 = dimm * 2; cs1 = dimm * 2 + 1; - cs_mode = f17_get_cs_mode(dimm, ctrl, pvt); + cs_mode = umc_get_cs_mode(dimm, ctrl, pvt); - size0 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs0); - size1 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs1); + size0 = umc_addr_mask_to_cs_size(pvt, ctrl, cs_mode, cs0); + size1 = umc_addr_mask_to_cs_size(pvt, ctrl, cs_mode, cs1); amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", cs0, size0, @@ -1387,7 +1525,7 @@ static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) } } -static void __dump_misc_regs_df(struct amd64_pvt *pvt) +static void umc_dump_misc_regs(struct amd64_pvt *pvt) { struct amd64_umc *umc; u32 i, tmp, umc_base; @@ -1420,18 +1558,17 @@ static void __dump_misc_regs_df(struct amd64_pvt *pvt) if (umc->dram_type == MEM_LRDDR4 || umc->dram_type == MEM_LRDDR5) { amd_smn_read(pvt->mc_node_id, - umc_base + get_umc_reg(UMCCH_ADDR_CFG), + umc_base + get_umc_reg(pvt, UMCCH_ADDR_CFG), &tmp); edac_dbg(1, "UMC%d LRDIMM %dx rank multiply\n", i, 1 << ((tmp >> 4) & 0x3)); } - debug_display_dimm_sizes_df(pvt, i); + umc_debug_display_dimm_sizes(pvt, i); } } -/* Display and decode various NB registers for debug purposes. */ -static void __dump_misc_regs(struct amd64_pvt *pvt) +static void dct_dump_misc_regs(struct amd64_pvt *pvt) { edac_dbg(1, "F3xE8 (NB Cap): 0x%08x\n", pvt->nbcap); @@ -1451,28 +1588,19 @@ static void __dump_misc_regs(struct amd64_pvt *pvt) (pvt->fam == 0xf) ? k8_dhar_offset(pvt) : f10_dhar_offset(pvt)); - debug_display_dimm_sizes(pvt, 0); + dct_debug_display_dimm_sizes(pvt, 0); /* everything below this point is Fam10h and above */ if (pvt->fam == 0xf) return; - debug_display_dimm_sizes(pvt, 1); + dct_debug_display_dimm_sizes(pvt, 1); /* Only if NOT ganged does dclr1 have valid info */ if (!dct_ganging_enabled(pvt)) debug_dump_dramcfg_low(pvt, pvt->dclr1, 1); edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); -} - -/* Display and decode various NB registers for debug purposes. */ -static void dump_misc_regs(struct amd64_pvt *pvt) -{ - if (pvt->umc) - __dump_misc_regs_df(pvt); - else - __dump_misc_regs(pvt); amd64_info("using x%u syndromes.\n", pvt->ecc_sym_sz); } @@ -1480,7 +1608,7 @@ static void dump_misc_regs(struct amd64_pvt *pvt) /* * See BKDG, F2x[1,0][5C:40], F2[1,0][6C:60] */ -static void prep_chip_selects(struct amd64_pvt *pvt) +static void dct_prep_chip_selects(struct amd64_pvt *pvt) { if (pvt->fam == 0xf && pvt->ext_model < K8_REV_F) { pvt->csels[0].b_cnt = pvt->csels[1].b_cnt = 8; @@ -1488,21 +1616,23 @@ static void prep_chip_selects(struct amd64_pvt *pvt) } else if (pvt->fam == 0x15 && pvt->model == 0x30) { pvt->csels[0].b_cnt = pvt->csels[1].b_cnt = 4; pvt->csels[0].m_cnt = pvt->csels[1].m_cnt = 2; - } else if (pvt->fam >= 0x17) { - int umc; - - for_each_umc(umc) { - pvt->csels[umc].b_cnt = 4; - pvt->csels[umc].m_cnt = fam_type->flags.zn_regs_v2 ? 4 : 2; - } - } else { pvt->csels[0].b_cnt = pvt->csels[1].b_cnt = 8; pvt->csels[0].m_cnt = pvt->csels[1].m_cnt = 4; } } -static void read_umc_base_mask(struct amd64_pvt *pvt) +static void umc_prep_chip_selects(struct amd64_pvt *pvt) +{ + int umc; + + for_each_umc(umc) { + pvt->csels[umc].b_cnt = 4; + pvt->csels[umc].m_cnt = pvt->flags.zn_regs_v2 ? 4 : 2; + } +} + +static void umc_read_base_mask(struct amd64_pvt *pvt) { u32 umc_base_reg, umc_base_reg_sec; u32 umc_mask_reg, umc_mask_reg_sec; @@ -1533,7 +1663,7 @@ static void read_umc_base_mask(struct amd64_pvt *pvt) } umc_mask_reg = get_umc_base(umc) + UMCCH_ADDR_MASK; - umc_mask_reg_sec = get_umc_base(umc) + get_umc_reg(UMCCH_ADDR_MASK_SEC); + umc_mask_reg_sec = get_umc_base(umc) + get_umc_reg(pvt, UMCCH_ADDR_MASK_SEC); for_each_chip_select_mask(cs, umc, pvt) { mask = &pvt->csels[umc].csmasks[cs]; @@ -1556,15 +1686,10 @@ static void read_umc_base_mask(struct amd64_pvt *pvt) /* * Function 2 Offset F10_DCSB0; read in the DCS Base and DCS Mask registers */ -static void read_dct_base_mask(struct amd64_pvt *pvt) +static void dct_read_base_mask(struct amd64_pvt *pvt) { int cs; - prep_chip_selects(pvt); - - if (pvt->umc) - return read_umc_base_mask(pvt); - for_each_chip_select(cs, 0, pvt) { int reg0 = DCSB0 + (cs * 4); int reg1 = DCSB1 + (cs * 4); @@ -1604,7 +1729,7 @@ static void read_dct_base_mask(struct amd64_pvt *pvt) } } -static void determine_memory_type_df(struct amd64_pvt *pvt) +static void umc_determine_memory_type(struct amd64_pvt *pvt) { struct amd64_umc *umc; u32 i; @@ -1621,7 +1746,7 @@ static void determine_memory_type_df(struct amd64_pvt *pvt) * Check if the system supports the "DDR Type" field in UMC Config * and has DDR5 DIMMs in use. */ - if (fam_type->flags.zn_regs_v2 && ((umc->umc_cfg & GENMASK(2, 0)) == 0x1)) { + if (pvt->flags.zn_regs_v2 && ((umc->umc_cfg & GENMASK(2, 0)) == 0x1)) { if (umc->dimm_cfg & BIT(5)) umc->dram_type = MEM_LRDDR5; else if (umc->dimm_cfg & BIT(4)) @@ -1641,13 +1766,10 @@ static void determine_memory_type_df(struct amd64_pvt *pvt) } } -static void determine_memory_type(struct amd64_pvt *pvt) +static void dct_determine_memory_type(struct amd64_pvt *pvt) { u32 dram_ctrl, dcsm; - if (pvt->umc) - return determine_memory_type_df(pvt); - switch (pvt->fam) { case 0xf: if (pvt->ext_model >= K8_REV_F) @@ -1697,6 +1819,8 @@ static void determine_memory_type(struct amd64_pvt *pvt) WARN(1, KERN_ERR "%s: Family??? 0x%x\n", __func__, pvt->fam); pvt->dram_type = MEM_EMPTY; } + + edac_dbg(1, " DIMM type: %s\n", edac_mem_types[pvt->dram_type]); return; ddr3: @@ -2081,84 +2205,6 @@ static int f16_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, return ddr3_cs_size(cs_mode, false); } -static int f17_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, - unsigned int cs_mode, int csrow_nr) -{ - u32 addr_mask_orig, addr_mask_deinterleaved; - u32 msb, weight, num_zero_bits; - int cs_mask_nr = csrow_nr; - int dimm, size = 0; - - /* No Chip Selects are enabled. */ - if (!cs_mode) - return size; - - /* Requested size of an even CS but none are enabled. */ - if (!(cs_mode & CS_EVEN) && !(csrow_nr & 1)) - return size; - - /* Requested size of an odd CS but none are enabled. */ - if (!(cs_mode & CS_ODD) && (csrow_nr & 1)) - return size; - - /* - * Family 17h introduced systems with one mask per DIMM, - * and two Chip Selects per DIMM. - * - * CS0 and CS1 -> MASK0 / DIMM0 - * CS2 and CS3 -> MASK1 / DIMM1 - * - * Family 19h Model 10h introduced systems with one mask per Chip Select, - * and two Chip Selects per DIMM. - * - * CS0 -> MASK0 -> DIMM0 - * CS1 -> MASK1 -> DIMM0 - * CS2 -> MASK2 -> DIMM1 - * CS3 -> MASK3 -> DIMM1 - * - * Keep the mask number equal to the Chip Select number for newer systems, - * and shift the mask number for older systems. - */ - dimm = csrow_nr >> 1; - - if (!fam_type->flags.zn_regs_v2) - cs_mask_nr >>= 1; - - /* Asymmetric dual-rank DIMM support. */ - if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY)) - addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr]; - else - addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr]; - - /* - * The number of zero bits in the mask is equal to the number of bits - * in a full mask minus the number of bits in the current mask. - * - * The MSB is the number of bits in the full mask because BIT[0] is - * always 0. - * - * In the special 3 Rank interleaving case, a single bit is flipped - * without swapping with the most significant bit. This can be handled - * by keeping the MSB where it is and ignoring the single zero bit. - */ - msb = fls(addr_mask_orig) - 1; - weight = hweight_long(addr_mask_orig); - num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE); - - /* Take the number of zero bits off from the top of the mask. */ - addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); - - edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm); - edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig); - edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved); - - /* Register [31:1] = Address [39:9]. Size is in kBs here. */ - size = (addr_mask_deinterleaved >> 2) + 1; - - /* Return size in MBs. */ - return size >> 10; -} - static void read_dram_ctl_register(struct amd64_pvt *pvt) { @@ -2682,196 +2728,6 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, } /* - * debug routine to display the memory sizes of all logical DIMMs and its - * CSROWs - */ -static void debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) -{ - int dimm, size0, size1; - u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; - u32 dbam = ctrl ? pvt->dbam1 : pvt->dbam0; - - if (pvt->fam == 0xf) { - /* K8 families < revF not supported yet */ - if (pvt->ext_model < K8_REV_F) - return; - else - WARN_ON(ctrl != 0); - } - - if (pvt->fam == 0x10) { - dbam = (ctrl && !dct_ganging_enabled(pvt)) ? pvt->dbam1 - : pvt->dbam0; - dcsb = (ctrl && !dct_ganging_enabled(pvt)) ? - pvt->csels[1].csbases : - pvt->csels[0].csbases; - } else if (ctrl) { - dbam = pvt->dbam0; - dcsb = pvt->csels[1].csbases; - } - edac_dbg(1, "F2x%d80 (DRAM Bank Address Mapping): 0x%08x\n", - ctrl, dbam); - - edac_printk(KERN_DEBUG, EDAC_MC, "DCT%d chip selects:\n", ctrl); - - /* Dump memory sizes for DIMM and its CSROWs */ - for (dimm = 0; dimm < 4; dimm++) { - - size0 = 0; - if (dcsb[dimm*2] & DCSB_CS_ENABLE) - /* - * For F15m60h, we need multiplier for LRDIMM cs_size - * calculation. We pass dimm value to the dbam_to_cs - * mapper so we can find the multiplier from the - * corresponding DCSM. - */ - size0 = pvt->ops->dbam_to_cs(pvt, ctrl, - DBAM_DIMM(dimm, dbam), - dimm); - - size1 = 0; - if (dcsb[dimm*2 + 1] & DCSB_CS_ENABLE) - size1 = pvt->ops->dbam_to_cs(pvt, ctrl, - DBAM_DIMM(dimm, dbam), - dimm); - - amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", - dimm * 2, size0, - dimm * 2 + 1, size1); - } -} - -static struct amd64_family_type family_types[] = { - [K8_CPUS] = { - .ctl_name = "K8", - .f1_id = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, - .f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow, - .dbam_to_cs = k8_dbam_to_chip_select, - } - }, - [F10_CPUS] = { - .ctl_name = "F10h", - .f1_id = PCI_DEVICE_ID_AMD_10H_NB_MAP, - .f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f10_dbam_to_chip_select, - } - }, - [F15_CPUS] = { - .ctl_name = "F15h", - .f1_id = PCI_DEVICE_ID_AMD_15H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f15_dbam_to_chip_select, - } - }, - [F15_M30H_CPUS] = { - .ctl_name = "F15h_M30h", - .f1_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f16_dbam_to_chip_select, - } - }, - [F15_M60H_CPUS] = { - .ctl_name = "F15h_M60h", - .f1_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f15_m60h_dbam_to_chip_select, - } - }, - [F16_CPUS] = { - .ctl_name = "F16h", - .f1_id = PCI_DEVICE_ID_AMD_16H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f16_dbam_to_chip_select, - } - }, - [F16_M30H_CPUS] = { - .ctl_name = "F16h_M30h", - .f1_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f16_dbam_to_chip_select, - } - }, - [F17_CPUS] = { - .ctl_name = "F17h", - .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, - [F17_M10H_CPUS] = { - .ctl_name = "F17h_M10h", - .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, - [F17_M30H_CPUS] = { - .ctl_name = "F17h_M30h", - .max_mcs = 8, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, - [F17_M60H_CPUS] = { - .ctl_name = "F17h_M60h", - .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, - [F17_M70H_CPUS] = { - .ctl_name = "F17h_M70h", - .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, - [F19_CPUS] = { - .ctl_name = "F19h", - .max_mcs = 8, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, - [F19_M10H_CPUS] = { - .ctl_name = "F19h_M10h", - .max_mcs = 12, - .flags.zn_regs_v2 = 1, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, - [F19_M50H_CPUS] = { - .ctl_name = "F19h_M50h", - .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } - }, -}; - -/* * These are tables of eigenvectors (one per line) which can be used for the * construction of the syndrome tables. The modified syndrome search algorithm * uses those to find the symbol in error and thus the DIMM. @@ -3118,10 +2974,14 @@ static inline void decode_bus_error(int node_id, struct mce *m) * Currently, we can derive the channel number by looking at the 6th nibble in * the instance_id. For example, instance_id=0xYXXXXX where Y is the channel * number. + * + * For DRAM ECC errors, the Chip Select number is given in bits [2:0] of + * the MCA_SYND[ErrorInformation] field. */ -static int find_umc_channel(struct mce *m) +static void umc_get_err_info(struct mce *m, struct err_info *err) { - return (m->ipid & GENMASK(31, 0)) >> 20; + err->channel = (m->ipid & GENMASK(31, 0)) >> 20; + err->csrow = m->synd & 0x7; } static void decode_umc_error(int node_id, struct mce *m) @@ -3143,8 +3003,6 @@ static void decode_umc_error(int node_id, struct mce *m) if (m->status & MCI_STATUS_DEFERRED) ecc_type = 3; - err.channel = find_umc_channel(m); - if (!(m->status & MCI_STATUS_SYNDV)) { err.err_code = ERR_SYND; goto log_error; @@ -3159,7 +3017,7 @@ static void decode_umc_error(int node_id, struct mce *m) err.err_code = ERR_CHANNEL; } - err.csrow = m->synd & 0x7; + pvt->ops->get_err_info(m, &err); if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) { err.err_code = ERR_NORM_ADDR; @@ -3179,9 +3037,6 @@ log_error: static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) { - if (pvt->umc) - return 0; - /* Reserve the ADDRESS MAP Device */ pvt->F1 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); if (!pvt->F1) { @@ -3209,36 +3064,11 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) return 0; } -static void free_mc_sibling_devs(struct amd64_pvt *pvt) -{ - if (pvt->umc) { - return; - } else { - pci_dev_put(pvt->F1); - pci_dev_put(pvt->F2); - } -} - static void determine_ecc_sym_sz(struct amd64_pvt *pvt) { pvt->ecc_sym_sz = 4; - if (pvt->umc) { - u8 i; - - for_each_umc(i) { - /* Check enabled channels only: */ - if (pvt->umc[i].sdp_ctrl & UMC_SDP_INIT) { - if (pvt->umc[i].ecc_ctrl & BIT(9)) { - pvt->ecc_sym_sz = 16; - return; - } else if (pvt->umc[i].ecc_ctrl & BIT(7)) { - pvt->ecc_sym_sz = 8; - return; - } - } - } - } else if (pvt->fam >= 0x10) { + if (pvt->fam >= 0x10) { u32 tmp; amd64_read_pci_cfg(pvt->F3, EXT_NB_MCA_CFG, &tmp); @@ -3255,7 +3085,7 @@ static void determine_ecc_sym_sz(struct amd64_pvt *pvt) /* * Retrieve the hardware registers of the memory controller. */ -static void __read_mc_regs_df(struct amd64_pvt *pvt) +static void umc_read_mc_regs(struct amd64_pvt *pvt) { u8 nid = pvt->mc_node_id; struct amd64_umc *umc; @@ -3267,7 +3097,7 @@ static void __read_mc_regs_df(struct amd64_pvt *pvt) umc_base = get_umc_base(i); umc = &pvt->umc[i]; - amd_smn_read(nid, umc_base + get_umc_reg(UMCCH_DIMM_CFG), &umc->dimm_cfg); + amd_smn_read(nid, umc_base + get_umc_reg(pvt, UMCCH_DIMM_CFG), &umc->dimm_cfg); amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg); amd_smn_read(nid, umc_base + UMCCH_SDP_CTRL, &umc->sdp_ctrl); amd_smn_read(nid, umc_base + UMCCH_ECC_CTRL, &umc->ecc_ctrl); @@ -3279,7 +3109,7 @@ static void __read_mc_regs_df(struct amd64_pvt *pvt) * Retrieve the hardware registers of the memory controller (this includes the * 'Address Map' and 'Misc' device regs) */ -static void read_mc_regs(struct amd64_pvt *pvt) +static void dct_read_mc_regs(struct amd64_pvt *pvt) { unsigned int range; u64 msr_val; @@ -3300,12 +3130,6 @@ static void read_mc_regs(struct amd64_pvt *pvt) edac_dbg(0, " TOP_MEM2 disabled\n"); } - if (pvt->umc) { - __read_mc_regs_df(pvt); - - goto skip; - } - amd64_read_pci_cfg(pvt->F3, NBCAP, &pvt->nbcap); read_dram_ctl_register(pvt); @@ -3346,14 +3170,6 @@ static void read_mc_regs(struct amd64_pvt *pvt) amd64_read_dct_pci_cfg(pvt, 1, DCHR0, &pvt->dchr1); } -skip: - read_dct_base_mask(pvt); - - determine_memory_type(pvt); - - if (!pvt->umc) - edac_dbg(1, " DIMM type: %s\n", edac_mem_types[pvt->dram_type]); - determine_ecc_sym_sz(pvt); } @@ -3391,36 +3207,47 @@ skip: * encompasses * */ -static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) +static u32 dct_get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) { u32 dbam = dct ? pvt->dbam1 : pvt->dbam0; - int csrow_nr = csrow_nr_orig; u32 cs_mode, nr_pages; - if (!pvt->umc) { - csrow_nr >>= 1; - cs_mode = DBAM_DIMM(csrow_nr, dbam); - } else { - cs_mode = f17_get_cs_mode(csrow_nr >> 1, dct, pvt); - } + csrow_nr >>= 1; + cs_mode = DBAM_DIMM(csrow_nr, dbam); nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr); nr_pages <<= 20 - PAGE_SHIFT; edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n", - csrow_nr_orig, dct, cs_mode); + csrow_nr, dct, cs_mode); edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); return nr_pages; } -static int init_csrows_df(struct mem_ctl_info *mci) +static u32 umc_get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) +{ + int csrow_nr = csrow_nr_orig; + u32 cs_mode, nr_pages; + + cs_mode = umc_get_cs_mode(csrow_nr >> 1, dct, pvt); + + nr_pages = umc_addr_mask_to_cs_size(pvt, dct, cs_mode, csrow_nr); + nr_pages <<= 20 - PAGE_SHIFT; + + edac_dbg(0, "csrow: %d, channel: %d, cs_mode %d\n", + csrow_nr_orig, dct, cs_mode); + edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); + + return nr_pages; +} + +static void umc_init_csrows(struct mem_ctl_info *mci) { struct amd64_pvt *pvt = mci->pvt_info; enum edac_type edac_mode = EDAC_NONE; enum dev_type dev_type = DEV_UNKNOWN; struct dimm_info *dimm; - int empty = 1; u8 umc, cs; if (mci->edac_ctl_cap & EDAC_FLAG_S16ECD16ED) { @@ -3441,40 +3268,34 @@ static int init_csrows_df(struct mem_ctl_info *mci) if (!csrow_enabled(cs, umc, pvt)) continue; - empty = 0; dimm = mci->csrows[cs]->channels[umc]->dimm; edac_dbg(1, "MC node: %d, csrow: %d\n", pvt->mc_node_id, cs); - dimm->nr_pages = get_csrow_nr_pages(pvt, umc, cs); + dimm->nr_pages = umc_get_csrow_nr_pages(pvt, umc, cs); dimm->mtype = pvt->umc[umc].dram_type; dimm->edac_mode = edac_mode; dimm->dtype = dev_type; dimm->grain = 64; } } - - return empty; } /* * Initialize the array of csrow attribute instances, based on the values * from pci config hardware registers. */ -static int init_csrows(struct mem_ctl_info *mci) +static void dct_init_csrows(struct mem_ctl_info *mci) { struct amd64_pvt *pvt = mci->pvt_info; enum edac_type edac_mode = EDAC_NONE; struct csrow_info *csrow; struct dimm_info *dimm; - int i, j, empty = 1; int nr_pages = 0; + int i, j; u32 val; - if (pvt->umc) - return init_csrows_df(mci); - amd64_read_pci_cfg(pvt->F3, NBCFG, &val); pvt->nbcfg = val; @@ -3497,19 +3318,18 @@ static int init_csrows(struct mem_ctl_info *mci) continue; csrow = mci->csrows[i]; - empty = 0; edac_dbg(1, "MC node: %d, csrow: %d\n", pvt->mc_node_id, i); if (row_dct0) { - nr_pages = get_csrow_nr_pages(pvt, 0, i); + nr_pages = dct_get_csrow_nr_pages(pvt, 0, i); csrow->channels[0]->dimm->nr_pages = nr_pages; } /* K8 has only one DCT */ if (pvt->fam != 0xf && row_dct1) { - int row_dct1_pages = get_csrow_nr_pages(pvt, 1, i); + int row_dct1_pages = dct_get_csrow_nr_pages(pvt, 1, i); csrow->channels[1]->dimm->nr_pages = row_dct1_pages; nr_pages += row_dct1_pages; @@ -3524,15 +3344,13 @@ static int init_csrows(struct mem_ctl_info *mci) : EDAC_SECDED; } - for (j = 0; j < fam_type->max_mcs; j++) { + for (j = 0; j < pvt->max_mcs; j++) { dimm = csrow->channels[j]->dimm; dimm->mtype = pvt->dram_type; dimm->edac_mode = edac_mode; dimm->grain = 64; } } - - return empty; } /* get all cores on this DCT */ @@ -3695,59 +3513,66 @@ static void restore_ecc_error_reporting(struct ecc_settings *s, u16 nid, amd64_warn("Error restoring NB MCGCTL settings!\n"); } -static bool ecc_enabled(struct amd64_pvt *pvt) +static bool dct_ecc_enabled(struct amd64_pvt *pvt) { u16 nid = pvt->mc_node_id; bool nb_mce_en = false; - u8 ecc_en = 0, i; + u8 ecc_en = 0; u32 value; - if (boot_cpu_data.x86 >= 0x17) { - u8 umc_en_mask = 0, ecc_en_mask = 0; - struct amd64_umc *umc; + amd64_read_pci_cfg(pvt->F3, NBCFG, &value); - for_each_umc(i) { - umc = &pvt->umc[i]; + ecc_en = !!(value & NBCFG_ECC_ENABLE); - /* Only check enabled UMCs. */ - if (!(umc->sdp_ctrl & UMC_SDP_INIT)) - continue; + nb_mce_en = nb_mce_bank_enabled_on_node(nid); + if (!nb_mce_en) + edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n", + MSR_IA32_MCG_CTL, nid); - umc_en_mask |= BIT(i); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled")); - if (umc->umc_cap_hi & UMC_ECC_ENABLED) - ecc_en_mask |= BIT(i); - } + if (!ecc_en || !nb_mce_en) + return false; + else + return true; +} - /* Check whether at least one UMC is enabled: */ - if (umc_en_mask) - ecc_en = umc_en_mask == ecc_en_mask; - else - edac_dbg(0, "Node %d: No enabled UMCs.\n", nid); +static bool umc_ecc_enabled(struct amd64_pvt *pvt) +{ + u8 umc_en_mask = 0, ecc_en_mask = 0; + u16 nid = pvt->mc_node_id; + struct amd64_umc *umc; + u8 ecc_en = 0, i; - /* Assume UMC MCA banks are enabled. */ - nb_mce_en = true; - } else { - amd64_read_pci_cfg(pvt->F3, NBCFG, &value); + for_each_umc(i) { + umc = &pvt->umc[i]; - ecc_en = !!(value & NBCFG_ECC_ENABLE); + /* Only check enabled UMCs. */ + if (!(umc->sdp_ctrl & UMC_SDP_INIT)) + continue; + + umc_en_mask |= BIT(i); - nb_mce_en = nb_mce_bank_enabled_on_node(nid); - if (!nb_mce_en) - edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n", - MSR_IA32_MCG_CTL, nid); + if (umc->umc_cap_hi & UMC_ECC_ENABLED) + ecc_en_mask |= BIT(i); } + /* Check whether at least one UMC is enabled: */ + if (umc_en_mask) + ecc_en = umc_en_mask == ecc_en_mask; + else + edac_dbg(0, "Node %d: No enabled UMCs.\n", nid); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled")); - if (!ecc_en || !nb_mce_en) + if (!ecc_en) return false; else return true; } static inline void -f17h_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt) +umc_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt) { u8 i, ecc_en = 1, cpk_en = 1, dev_x4 = 1, dev_x16 = 1; @@ -3777,145 +3602,234 @@ f17h_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt) } } -static void setup_mci_misc_attrs(struct mem_ctl_info *mci) +static void dct_setup_mci_misc_attrs(struct mem_ctl_info *mci) { struct amd64_pvt *pvt = mci->pvt_info; mci->mtype_cap = MEM_FLAG_DDR2 | MEM_FLAG_RDDR2; mci->edac_ctl_cap = EDAC_FLAG_NONE; - if (pvt->umc) { - f17h_determine_edac_ctl_cap(mci, pvt); - } else { - if (pvt->nbcap & NBCAP_SECDED) - mci->edac_ctl_cap |= EDAC_FLAG_SECDED; + if (pvt->nbcap & NBCAP_SECDED) + mci->edac_ctl_cap |= EDAC_FLAG_SECDED; - if (pvt->nbcap & NBCAP_CHIPKILL) - mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED; - } + if (pvt->nbcap & NBCAP_CHIPKILL) + mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED; - mci->edac_cap = determine_edac_cap(pvt); + mci->edac_cap = dct_determine_edac_cap(pvt); mci->mod_name = EDAC_MOD_STR; - mci->ctl_name = fam_type->ctl_name; + mci->ctl_name = pvt->ctl_name; mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; - if (pvt->fam >= 0x17) - return; - /* memory scrubber interface */ mci->set_sdram_scrub_rate = set_scrub_rate; mci->get_sdram_scrub_rate = get_scrub_rate; + + dct_init_csrows(mci); } -/* - * returns a pointer to the family descriptor on success, NULL otherwise. - */ -static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) +static void umc_setup_mci_misc_attrs(struct mem_ctl_info *mci) +{ + struct amd64_pvt *pvt = mci->pvt_info; + + mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_RDDR4; + mci->edac_ctl_cap = EDAC_FLAG_NONE; + + umc_determine_edac_ctl_cap(mci, pvt); + + mci->edac_cap = umc_determine_edac_cap(pvt); + mci->mod_name = EDAC_MOD_STR; + mci->ctl_name = pvt->ctl_name; + mci->dev_name = pci_name(pvt->F3); + mci->ctl_page_to_phys = NULL; + + umc_init_csrows(mci); +} + +static int dct_hw_info_get(struct amd64_pvt *pvt) +{ + int ret = reserve_mc_sibling_devs(pvt, pvt->f1_id, pvt->f2_id); + + if (ret) + return ret; + + dct_prep_chip_selects(pvt); + dct_read_base_mask(pvt); + dct_read_mc_regs(pvt); + dct_determine_memory_type(pvt); + + return 0; +} + +static int umc_hw_info_get(struct amd64_pvt *pvt) +{ + pvt->umc = kcalloc(pvt->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); + if (!pvt->umc) + return -ENOMEM; + + umc_prep_chip_selects(pvt); + umc_read_base_mask(pvt); + umc_read_mc_regs(pvt); + umc_determine_memory_type(pvt); + + return 0; +} + +static void hw_info_put(struct amd64_pvt *pvt) +{ + pci_dev_put(pvt->F1); + pci_dev_put(pvt->F2); + kfree(pvt->umc); +} + +static struct low_ops umc_ops = { + .hw_info_get = umc_hw_info_get, + .ecc_enabled = umc_ecc_enabled, + .setup_mci_misc_attrs = umc_setup_mci_misc_attrs, + .dump_misc_regs = umc_dump_misc_regs, + .get_err_info = umc_get_err_info, +}; + +/* Use Family 16h versions for defaults and adjust as needed below. */ +static struct low_ops dct_ops = { + .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, + .dbam_to_cs = f16_dbam_to_chip_select, + .hw_info_get = dct_hw_info_get, + .ecc_enabled = dct_ecc_enabled, + .setup_mci_misc_attrs = dct_setup_mci_misc_attrs, + .dump_misc_regs = dct_dump_misc_regs, +}; + +static int per_family_init(struct amd64_pvt *pvt) { pvt->ext_model = boot_cpu_data.x86_model >> 4; pvt->stepping = boot_cpu_data.x86_stepping; pvt->model = boot_cpu_data.x86_model; pvt->fam = boot_cpu_data.x86; + pvt->max_mcs = 2; + + /* + * Decide on which ops group to use here and do any family/model + * overrides below. + */ + if (pvt->fam >= 0x17) + pvt->ops = &umc_ops; + else + pvt->ops = &dct_ops; switch (pvt->fam) { case 0xf: - fam_type = &family_types[K8_CPUS]; - pvt->ops = &family_types[K8_CPUS].ops; + pvt->ctl_name = (pvt->ext_model >= K8_REV_F) ? + "K8 revF or later" : "K8 revE or earlier"; + pvt->f1_id = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP; + pvt->f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL; + pvt->ops->map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow; + pvt->ops->dbam_to_cs = k8_dbam_to_chip_select; break; case 0x10: - fam_type = &family_types[F10_CPUS]; - pvt->ops = &family_types[F10_CPUS].ops; + pvt->ctl_name = "F10h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_10H_NB_MAP; + pvt->f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM; + pvt->ops->dbam_to_cs = f10_dbam_to_chip_select; break; case 0x15: - if (pvt->model == 0x30) { - fam_type = &family_types[F15_M30H_CPUS]; - pvt->ops = &family_types[F15_M30H_CPUS].ops; + switch (pvt->model) { + case 0x30: + pvt->ctl_name = "F15h_M30h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2; break; - } else if (pvt->model == 0x60) { - fam_type = &family_types[F15_M60H_CPUS]; - pvt->ops = &family_types[F15_M60H_CPUS].ops; + case 0x60: + pvt->ctl_name = "F15h_M60h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2; + pvt->ops->dbam_to_cs = f15_m60h_dbam_to_chip_select; + break; + case 0x13: + /* Richland is only client */ + return -ENODEV; + default: + pvt->ctl_name = "F15h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_15H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2; + pvt->ops->dbam_to_cs = f15_dbam_to_chip_select; break; - /* Richland is only client */ - } else if (pvt->model == 0x13) { - return NULL; - } else { - fam_type = &family_types[F15_CPUS]; - pvt->ops = &family_types[F15_CPUS].ops; } break; case 0x16: - if (pvt->model == 0x30) { - fam_type = &family_types[F16_M30H_CPUS]; - pvt->ops = &family_types[F16_M30H_CPUS].ops; + switch (pvt->model) { + case 0x30: + pvt->ctl_name = "F16h_M30h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2; + break; + default: + pvt->ctl_name = "F16h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_16H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2; break; } - fam_type = &family_types[F16_CPUS]; - pvt->ops = &family_types[F16_CPUS].ops; break; case 0x17: - if (pvt->model >= 0x10 && pvt->model <= 0x2f) { - fam_type = &family_types[F17_M10H_CPUS]; - pvt->ops = &family_types[F17_M10H_CPUS].ops; + switch (pvt->model) { + case 0x10 ... 0x2f: + pvt->ctl_name = "F17h_M10h"; break; - } else if (pvt->model >= 0x30 && pvt->model <= 0x3f) { - fam_type = &family_types[F17_M30H_CPUS]; - pvt->ops = &family_types[F17_M30H_CPUS].ops; + case 0x30 ... 0x3f: + pvt->ctl_name = "F17h_M30h"; + pvt->max_mcs = 8; break; - } else if (pvt->model >= 0x60 && pvt->model <= 0x6f) { - fam_type = &family_types[F17_M60H_CPUS]; - pvt->ops = &family_types[F17_M60H_CPUS].ops; + case 0x60 ... 0x6f: + pvt->ctl_name = "F17h_M60h"; break; - } else if (pvt->model >= 0x70 && pvt->model <= 0x7f) { - fam_type = &family_types[F17_M70H_CPUS]; - pvt->ops = &family_types[F17_M70H_CPUS].ops; + case 0x70 ... 0x7f: + pvt->ctl_name = "F17h_M70h"; + break; + default: + pvt->ctl_name = "F17h"; break; } - fallthrough; - case 0x18: - fam_type = &family_types[F17_CPUS]; - pvt->ops = &family_types[F17_CPUS].ops; + break; - if (pvt->fam == 0x18) - family_types[F17_CPUS].ctl_name = "F18h"; + case 0x18: + pvt->ctl_name = "F18h"; break; case 0x19: - if (pvt->model >= 0x10 && pvt->model <= 0x1f) { - fam_type = &family_types[F19_M10H_CPUS]; - pvt->ops = &family_types[F19_M10H_CPUS].ops; + switch (pvt->model) { + case 0x00 ... 0x0f: + pvt->ctl_name = "F19h"; + pvt->max_mcs = 8; break; - } else if (pvt->model >= 0x20 && pvt->model <= 0x2f) { - fam_type = &family_types[F17_M70H_CPUS]; - pvt->ops = &family_types[F17_M70H_CPUS].ops; - fam_type->ctl_name = "F19h_M20h"; + case 0x10 ... 0x1f: + pvt->ctl_name = "F19h_M10h"; + pvt->max_mcs = 12; + pvt->flags.zn_regs_v2 = 1; break; - } else if (pvt->model >= 0x50 && pvt->model <= 0x5f) { - fam_type = &family_types[F19_M50H_CPUS]; - pvt->ops = &family_types[F19_M50H_CPUS].ops; - fam_type->ctl_name = "F19h_M50h"; + case 0x20 ... 0x2f: + pvt->ctl_name = "F19h_M20h"; break; - } else if (pvt->model >= 0xa0 && pvt->model <= 0xaf) { - fam_type = &family_types[F19_M10H_CPUS]; - pvt->ops = &family_types[F19_M10H_CPUS].ops; - fam_type->ctl_name = "F19h_MA0h"; + case 0x50 ... 0x5f: + pvt->ctl_name = "F19h_M50h"; + break; + case 0xa0 ... 0xaf: + pvt->ctl_name = "F19h_MA0h"; + pvt->max_mcs = 12; + pvt->flags.zn_regs_v2 = 1; break; } - fam_type = &family_types[F19_CPUS]; - pvt->ops = &family_types[F19_CPUS].ops; - family_types[F19_CPUS].ctl_name = "F19h"; break; default: amd64_err("Unsupported family!\n"); - return NULL; + return -ENODEV; } - return fam_type; + return 0; } static const struct attribute_group *amd64_edac_attr_groups[] = { @@ -3926,37 +3840,6 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { NULL }; -static int hw_info_get(struct amd64_pvt *pvt) -{ - u16 pci_id1 = 0, pci_id2 = 0; - int ret; - - if (pvt->fam >= 0x17) { - pvt->umc = kcalloc(fam_type->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); - if (!pvt->umc) - return -ENOMEM; - } else { - pci_id1 = fam_type->f1_id; - pci_id2 = fam_type->f2_id; - } - - ret = reserve_mc_sibling_devs(pvt, pci_id1, pci_id2); - if (ret) - return ret; - - read_mc_regs(pvt); - - return 0; -} - -static void hw_info_put(struct amd64_pvt *pvt) -{ - if (pvt->F1) - free_mc_sibling_devs(pvt); - - kfree(pvt->umc); -} - static int init_one_instance(struct amd64_pvt *pvt) { struct mem_ctl_info *mci = NULL; @@ -3967,7 +3850,7 @@ static int init_one_instance(struct amd64_pvt *pvt) layers[0].size = pvt->csels[0].b_cnt; layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - layers[1].size = fam_type->max_mcs; + layers[1].size = pvt->max_mcs; layers[1].is_virt_csrow = false; mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0); @@ -3977,10 +3860,7 @@ static int init_one_instance(struct amd64_pvt *pvt) mci->pvt_info = pvt; mci->pdev = &pvt->F3->dev; - setup_mci_misc_attrs(mci); - - if (init_csrows(mci)) - mci->edac_cap = EDAC_FLAG_NONE; + pvt->ops->setup_mci_misc_attrs(mci); ret = -ENODEV; if (edac_mc_add_mc_with_groups(mci, amd64_edac_attr_groups)) { @@ -3997,7 +3877,7 @@ static bool instance_has_memory(struct amd64_pvt *pvt) bool cs_enabled = false; int cs = 0, dct = 0; - for (dct = 0; dct < fam_type->max_mcs; dct++) { + for (dct = 0; dct < pvt->max_mcs; dct++) { for_each_chip_select(cs, dct, pvt) cs_enabled |= csrow_enabled(cs, dct, pvt); } @@ -4026,12 +3906,11 @@ static int probe_one_instance(unsigned int nid) pvt->mc_node_id = nid; pvt->F3 = F3; - ret = -ENODEV; - fam_type = per_family_init(pvt); - if (!fam_type) + ret = per_family_init(pvt); + if (ret < 0) goto err_enable; - ret = hw_info_get(pvt); + ret = pvt->ops->hw_info_get(pvt); if (ret < 0) goto err_enable; @@ -4041,7 +3920,7 @@ static int probe_one_instance(unsigned int nid) goto err_enable; } - if (!ecc_enabled(pvt)) { + if (!pvt->ops->ecc_enabled(pvt)) { ret = -ENODEV; if (!ecc_enable_override) @@ -4067,13 +3946,10 @@ static int probe_one_instance(unsigned int nid) goto err_enable; } - amd64_info("%s %sdetected (node %d).\n", fam_type->ctl_name, - (pvt->fam == 0xf ? - (pvt->ext_model >= K8_REV_F ? "revF or later " - : "revE or earlier ") - : ""), pvt->mc_node_id); + amd64_info("%s detected (node %d).\n", pvt->ctl_name, pvt->mc_node_id); - dump_misc_regs(pvt); + /* Display and decode various registers for debug purposes. */ + pvt->ops->dump_misc_regs(pvt); return ret; @@ -4244,10 +4120,8 @@ module_init(amd64_edac_init); module_exit(amd64_edac_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("SoftwareBitMaker: Doug Thompson, " - "Dave Peterson, Thayne Harbaugh"); -MODULE_DESCRIPTION("MC support for AMD64 memory controllers - " - EDAC_AMD64_VERSION); +MODULE_AUTHOR("SoftwareBitMaker: Doug Thompson, Dave Peterson, Thayne Harbaugh; AMD"); +MODULE_DESCRIPTION("MC support for AMD64 memory controllers - " EDAC_AMD64_VERSION); module_param(edac_op_state, int, 0444); MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index e4329dff8cf2..e84fe0d4120a 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -273,25 +273,6 @@ #define UMC_SDP_INIT BIT(31) -enum amd_families { - K8_CPUS = 0, - F10_CPUS, - F15_CPUS, - F15_M30H_CPUS, - F15_M60H_CPUS, - F16_CPUS, - F16_M30H_CPUS, - F17_CPUS, - F17_M10H_CPUS, - F17_M30H_CPUS, - F17_M60H_CPUS, - F17_M70H_CPUS, - F19_CPUS, - F19_M10H_CPUS, - F19_M50H_CPUS, - NUM_FAMILIES, -}; - /* Error injection control structure */ struct error_injection { u32 section; @@ -334,6 +315,16 @@ struct amd64_umc { enum mem_type dram_type; }; +struct amd64_family_flags { + /* + * Indicates that the system supports the new register offsets, etc. + * first introduced with Family 19h Model 10h. + */ + __u64 zn_regs_v2 : 1, + + __reserved : 63; +}; + struct amd64_pvt { struct low_ops *ops; @@ -375,6 +366,12 @@ struct amd64_pvt { /* x4, x8, or x16 syndromes in use */ u8 ecc_sym_sz; + const char *ctl_name; + u16 f1_id, f2_id; + /* Maximum number of memory controllers per die/node. */ + u8 max_mcs; + + struct amd64_family_flags flags; /* place to store error injection parameters prior to issue */ struct error_injection injection; @@ -465,29 +462,15 @@ struct ecc_settings { * functions and per device encoding/decoding logic. */ struct low_ops { - void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, - struct err_info *); - int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, - unsigned cs_mode, int cs_mask_nr); -}; - -struct amd64_family_flags { - /* - * Indicates that the system supports the new register offsets, etc. - * first introduced with Family 19h Model 10h. - */ - __u64 zn_regs_v2 : 1, - - __reserved : 63; -}; - -struct amd64_family_type { - const char *ctl_name; - u16 f1_id, f2_id; - /* Maximum number of memory controllers per die/node. */ - u8 max_mcs; - struct amd64_family_flags flags; - struct low_ops ops; + void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci, u64 sys_addr, + struct err_info *err); + int (*dbam_to_cs)(struct amd64_pvt *pvt, u8 dct, + unsigned int cs_mode, int cs_mask_nr); + int (*hw_info_get)(struct amd64_pvt *pvt); + bool (*ecc_enabled)(struct amd64_pvt *pvt); + void (*setup_mci_misc_attrs)(struct mem_ctl_info *mci); + void (*dump_misc_regs)(struct amd64_pvt *pvt); + void (*get_err_info)(struct mce *m, struct err_info *err); }; int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c index 7508aa416ddb..ca718f63fcbc 100644 --- a/drivers/edac/amd8111_edac.c +++ b/drivers/edac/amd8111_edac.c @@ -593,5 +593,5 @@ module_init(amd8111_edac_init); module_exit(amd8111_edac_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Cao Qingtao <qingtao.cao@windriver.com>\n"); +MODULE_AUTHOR("Cao Qingtao <qingtao.cao@windriver.com>"); MODULE_DESCRIPTION("AMD8111 HyperTransport I/O Hub EDAC kernel module"); diff --git a/drivers/edac/amd8131_edac.c b/drivers/edac/amd8131_edac.c index 169353710982..28610ba514f4 100644 --- a/drivers/edac/amd8131_edac.c +++ b/drivers/edac/amd8131_edac.c @@ -354,5 +354,5 @@ module_init(amd8131_edac_init); module_exit(amd8131_edac_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Cao Qingtao <qingtao.cao@windriver.com>\n"); +MODULE_AUTHOR("Cao Qingtao <qingtao.cao@windriver.com>"); MODULE_DESCRIPTION("AMD8131 HyperTransport PCI-X Tunnel EDAC kernel module"); diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c index ac7c9b42d4c7..7221b4bb6df2 100644 --- a/drivers/edac/e752x_edac.c +++ b/drivers/edac/e752x_edac.c @@ -1462,7 +1462,7 @@ module_init(e752x_init); module_exit(e752x_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Linux Networx (http://lnxi.com) Tom Zimmerman\n"); +MODULE_AUTHOR("Linux Networx (http://lnxi.com) Tom Zimmerman"); MODULE_DESCRIPTION("MC support for Intel e752x/3100 memory controllers"); module_param(force_function_unhide, int, 0444); diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c index 497e710fca3d..5852b95fa470 100644 --- a/drivers/edac/e7xxx_edac.c +++ b/drivers/edac/e7xxx_edac.c @@ -596,8 +596,7 @@ module_init(e7xxx_init); module_exit(e7xxx_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n" - "Based on.work by Dan Hollis et al"); +MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al"); MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers"); module_param(edac_op_state, int, 0444); MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 0a4691792801..a897b6aff368 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -906,6 +906,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SIERRAFOREST_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c index ba46057d4220..4b5a71f8739d 100644 --- a/drivers/edac/i5000_edac.c +++ b/drivers/edac/i5000_edac.c @@ -1573,13 +1573,10 @@ module_init(i5000_init); module_exit(i5000_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR - ("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>"); -MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - " - I5000_REVISION); +MODULE_AUTHOR("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>"); +MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - " I5000_REVISION); module_param(edac_op_state, int, 0444); MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); module_param(misc_messages, int, 0444); MODULE_PARM_DESC(misc_messages, "Log miscellaneous non fatal messages"); - diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index f5d82518c15e..d470afe65001 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -909,7 +909,7 @@ static void i5100_do_inject(struct mem_ctl_info *mci) * * The injection code don't work without setting this register. * The register needs to be flipped off then on else the hardware - * will only preform the first injection. + * will only perform the first injection. * * Stop condition bits 7:4 * 1010 - Stop after one injection @@ -1220,6 +1220,5 @@ module_init(i5100_init); module_exit(i5100_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR - ("Arthur Jones <ajones@riverbed.com>"); +MODULE_AUTHOR("Arthur Jones <ajones@riverbed.com>"); MODULE_DESCRIPTION("MC Driver for Intel I5100 memory controllers"); diff --git a/drivers/edac/i82860_edac.c b/drivers/edac/i82860_edac.c index fbec90d00f1e..b8a497f0de28 100644 --- a/drivers/edac/i82860_edac.c +++ b/drivers/edac/i82860_edac.c @@ -355,8 +355,7 @@ module_init(i82860_init); module_exit(i82860_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com) " - "Ben Woodard <woodard@redhat.com>"); +MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com) Ben Woodard <woodard@redhat.com>"); MODULE_DESCRIPTION("ECC support for Intel 82860 memory hub controllers"); module_param(edac_op_state, int, 0444); diff --git a/drivers/edac/layerscape_edac.c b/drivers/edac/layerscape_edac.c index 35ceaca578e1..7c5e2b3c0daa 100644 --- a/drivers/edac/layerscape_edac.c +++ b/drivers/edac/layerscape_edac.c @@ -72,5 +72,4 @@ module_exit(fsl_ddr_mc_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("NXP Semiconductor"); module_param(edac_op_state, int, 0444); -MODULE_PARM_DESC(edac_op_state, - "EDAC Error Reporting state: 0=Poll, 2=Interrupt"); +MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll, 2=Interrupt"); diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index e50d7928bf8f..55320546c174 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -711,5 +711,4 @@ module_exit(mpc85xx_mc_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Montavista Software, Inc."); module_param(edac_op_state, int, 0444); -MODULE_PARM_DESC(edac_op_state, - "EDAC Error Reporting state: 0=Poll, 2=Interrupt"); +MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll, 2=Interrupt"); diff --git a/drivers/edac/r82600_edac.c b/drivers/edac/r82600_edac.c index d0aef83dca2a..61e979d5437a 100644 --- a/drivers/edac/r82600_edac.c +++ b/drivers/edac/r82600_edac.c @@ -415,8 +415,7 @@ module_init(r82600_init); module_exit(r82600_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Tim Small <tim@buttersideup.com> - WPAD Ltd. " - "on behalf of EADS Astrium"); +MODULE_AUTHOR("Tim Small <tim@buttersideup.com> - WPAD Ltd. on behalf of EADS Astrium"); MODULE_DESCRIPTION("MC support for Radisys 82600 memory controllers"); module_param(disable_hardware_scrub, bool, 0644); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 9397abb42c49..0a862336a7ce 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -510,7 +510,7 @@ rir_found: } static u8 skx_close_row[] = { - 15, 16, 17, 18, 20, 21, 22, 28, 10, 11, 12, 13, 29, 30, 31, 32, 33 + 15, 16, 17, 18, 20, 21, 22, 28, 10, 11, 12, 13, 29, 30, 31, 32, 33, 34 }; static u8 skx_close_column[] = { @@ -518,7 +518,7 @@ static u8 skx_close_column[] = { }; static u8 skx_open_row[] = { - 14, 15, 16, 20, 28, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33 + 14, 15, 16, 20, 28, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34 }; static u8 skx_open_column[] = { diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 29619f49873a..d9629ff87861 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -167,7 +167,8 @@ int psci_set_osi_mode(bool enable) err = invoke_psci_fn(PSCI_1_0_FN_SET_SUSPEND_MODE, suspend_mode, 0, 0); if (err < 0) - pr_warn("failed to set %s mode: %d\n", enable ? "OSI" : "PC", err); + pr_info(FW_BUG "failed to set %s mode: %d\n", + enable ? "OSI" : "PC", err); return psci_to_linux_errno(err); } diff --git a/drivers/fpga/dfl-pci.c b/drivers/fpga/dfl-pci.c index 0914e7328b1a..1bc04378118c 100644 --- a/drivers/fpga/dfl-pci.c +++ b/drivers/fpga/dfl-pci.c @@ -21,7 +21,6 @@ #include <linux/module.h> #include <linux/stddef.h> #include <linux/errno.h> -#include <linux/aer.h> #include "dfl.h" @@ -376,10 +375,6 @@ int cci_pci_probe(struct pci_dev *pcidev, const struct pci_device_id *pcidevid) return ret; } - ret = pci_enable_pcie_error_reporting(pcidev); - if (ret && ret != -EINVAL) - dev_info(&pcidev->dev, "PCIE AER unavailable %d.\n", ret); - pci_set_master(pcidev); ret = dma_set_mask_and_coherent(&pcidev->dev, DMA_BIT_MASK(64)); @@ -387,24 +382,22 @@ int cci_pci_probe(struct pci_dev *pcidev, const struct pci_device_id *pcidevid) ret = dma_set_mask_and_coherent(&pcidev->dev, DMA_BIT_MASK(32)); if (ret) { dev_err(&pcidev->dev, "No suitable DMA support available.\n"); - goto disable_error_report_exit; + return ret; } ret = cci_init_drvdata(pcidev); if (ret) { dev_err(&pcidev->dev, "Fail to init drvdata %d.\n", ret); - goto disable_error_report_exit; + return ret; } ret = cci_enumerate_feature_devs(pcidev); - if (!ret) + if (ret) { + dev_err(&pcidev->dev, "enumeration failure %d.\n", ret); return ret; + } - dev_err(&pcidev->dev, "enumeration failure %d.\n", ret); - -disable_error_report_exit: - pci_disable_pcie_error_reporting(pcidev); - return ret; + return 0; } static int cci_pci_sriov_configure(struct pci_dev *pcidev, int num_vfs) @@ -448,7 +441,6 @@ static void cci_pci_remove(struct pci_dev *pcidev) cci_pci_sriov_configure(pcidev, 0); cci_remove_feature_devs(pcidev); - pci_disable_pcie_error_reporting(pcidev); } static struct pci_driver cci_pci_driver = { diff --git a/drivers/fpga/fpga-bridge.c b/drivers/fpga/fpga-bridge.c index 5cd40acab5bf..0953e6e4db04 100644 --- a/drivers/fpga/fpga-bridge.c +++ b/drivers/fpga/fpga-bridge.c @@ -363,7 +363,6 @@ fpga_bridge_register(struct device *parent, const char *name, bridge->dev.parent = parent; bridge->dev.of_node = parent->of_node; bridge->dev.id = id; - of_platform_populate(bridge->dev.of_node, NULL, NULL, &bridge->dev); ret = dev_set_name(&bridge->dev, "br%d", id); if (ret) @@ -375,6 +374,8 @@ fpga_bridge_register(struct device *parent, const char *name, return ERR_PTR(ret); } + of_platform_populate(bridge->dev.of_node, NULL, NULL, &bridge->dev); + return bridge; error_device: diff --git a/drivers/fpga/intel-m10-bmc-sec-update.c b/drivers/fpga/intel-m10-bmc-sec-update.c index f0acedc80182..d7e2f9f461bc 100644 --- a/drivers/fpga/intel-m10-bmc-sec-update.c +++ b/drivers/fpga/intel-m10-bmc-sec-update.c @@ -474,7 +474,7 @@ static enum fw_upload_err rsu_send_data(struct m10bmc_sec *sec) ret = sec->ops->rsu_status(sec); if (ret < 0) - return ret; + return FW_UPLOAD_ERR_HW_ERROR; status = ret; if (!rsu_status_ok(status)) { diff --git a/drivers/fpga/xilinx-pr-decoupler.c b/drivers/fpga/xilinx-pr-decoupler.c index 2d9c491f7be9..b76d85449b8f 100644 --- a/drivers/fpga/xilinx-pr-decoupler.c +++ b/drivers/fpga/xilinx-pr-decoupler.c @@ -69,7 +69,7 @@ static int xlnx_pr_decoupler_enable_show(struct fpga_bridge *bridge) if (err) return err; - status = readl(priv->io_base); + status = xlnx_pr_decouple_read(priv, CTRL_OFFSET); clk_disable(priv->clk); diff --git a/drivers/gpio/gpio-104-dio-48e.c b/drivers/gpio/gpio-104-dio-48e.c index a3846faf3780..11c48130bb8f 100644 --- a/drivers/gpio/gpio-104-dio-48e.c +++ b/drivers/gpio/gpio-104-dio-48e.c @@ -86,6 +86,7 @@ static const struct regmap_config dio48e_regmap_config = { .volatile_table = &dio48e_volatile_table, .precious_table = &dio48e_precious_table, .cache_type = REGCACHE_FLAT, + .use_raw_spinlock = true, }; /* only bit 3 on each respective Port C supports interrupts */ diff --git a/drivers/gpio/gpio-104-idi-48.c b/drivers/gpio/gpio-104-idi-48.c index ca2175b84e24..ba73ee9c0c29 100644 --- a/drivers/gpio/gpio-104-idi-48.c +++ b/drivers/gpio/gpio-104-idi-48.c @@ -81,6 +81,7 @@ static const struct regmap_config idi48_regmap_config = { .wr_table = &idi_48_wr_table, .rd_table = &idi_48_rd_table, .precious_table = &idi_48_precious_table, + .use_raw_spinlock = true, }; #define IDI48_NGPIO 48 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index d0a1cc88832c..fafebec5b7b6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -596,6 +596,9 @@ int amdgpu_irq_put(struct amdgpu_device *adev, struct amdgpu_irq_src *src, if (!src->enabled_types || !src->funcs->set) return -EINVAL; + if (WARN_ON(!amdgpu_irq_enabled(adev, src, type))) + return -EINVAL; + if (atomic_dec_and_test(&src->enabled_types[type])) return amdgpu_irq_update(adev, src, type); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c index dc4f37240beb..8af70feca720 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c @@ -169,10 +169,21 @@ static inline int dm_set_vblank(struct drm_crtc *crtc, bool enable) if (rc) return rc; - irq_source = IRQ_TYPE_VBLANK + acrtc->otg_inst; + if (amdgpu_in_reset(adev)) { + irq_source = IRQ_TYPE_VBLANK + acrtc->otg_inst; + /* During gpu-reset we disable and then enable vblank irq, so + * don't use amdgpu_irq_get/put() to avoid refcount change. + */ + if (!dc_interrupt_set(adev->dm.dc, irq_source, enable)) + rc = -EBUSY; + } else { + rc = (enable) + ? amdgpu_irq_get(adev, &adev->crtc_irq, acrtc->crtc_id) + : amdgpu_irq_put(adev, &adev->crtc_irq, acrtc->crtc_id); + } - if (!dc_interrupt_set(adev->dm.dc, irq_source, enable)) - return -EBUSY; + if (rc) + return rc; skip: if (amdgpu_in_reset(adev)) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index 1583157da355..efd025d8961e 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -177,6 +177,40 @@ void dm_helpers_dp_update_branch_info( const struct dc_link *link) {} +static void dm_helpers_construct_old_payload( + struct dc_link *link, + int pbn_per_slot, + struct drm_dp_mst_atomic_payload *new_payload, + struct drm_dp_mst_atomic_payload *old_payload) +{ + struct link_mst_stream_allocation_table current_link_table = + link->mst_stream_alloc_table; + struct link_mst_stream_allocation *dc_alloc; + int i; + + *old_payload = *new_payload; + + /* Set correct time_slots/PBN of old payload. + * other fields (delete & dsc_enabled) in + * struct drm_dp_mst_atomic_payload are don't care fields + * while calling drm_dp_remove_payload() + */ + for (i = 0; i < current_link_table.stream_count; i++) { + dc_alloc = + ¤t_link_table.stream_allocations[i]; + + if (dc_alloc->vcp_id == new_payload->vcpi) { + old_payload->time_slots = dc_alloc->slot_count; + old_payload->pbn = dc_alloc->slot_count * pbn_per_slot; + break; + } + } + + /* make sure there is an old payload*/ + ASSERT(i != current_link_table.stream_count); + +} + /* * Writes payload allocation table in immediate downstream device. */ @@ -188,7 +222,7 @@ bool dm_helpers_dp_mst_write_payload_allocation_table( { struct amdgpu_dm_connector *aconnector; struct drm_dp_mst_topology_state *mst_state; - struct drm_dp_mst_atomic_payload *payload; + struct drm_dp_mst_atomic_payload *target_payload, *new_payload, old_payload; struct drm_dp_mst_topology_mgr *mst_mgr; aconnector = (struct amdgpu_dm_connector *)stream->dm_stream_context; @@ -204,17 +238,26 @@ bool dm_helpers_dp_mst_write_payload_allocation_table( mst_state = to_drm_dp_mst_topology_state(mst_mgr->base.state); /* It's OK for this to fail */ - payload = drm_atomic_get_mst_payload_state(mst_state, aconnector->mst_output_port); - if (enable) - drm_dp_add_payload_part1(mst_mgr, mst_state, payload); - else - drm_dp_remove_payload(mst_mgr, mst_state, payload, payload); + new_payload = drm_atomic_get_mst_payload_state(mst_state, aconnector->mst_output_port); + + if (enable) { + target_payload = new_payload; + + drm_dp_add_payload_part1(mst_mgr, mst_state, new_payload); + } else { + /* construct old payload by VCPI*/ + dm_helpers_construct_old_payload(stream->link, mst_state->pbn_div, + new_payload, &old_payload); + target_payload = &old_payload; + + drm_dp_remove_payload(mst_mgr, mst_state, &old_payload, new_payload); + } /* mst_mgr->->payloads are VC payload notify MST branch using DPCD or * AUX message. The sequence is slot 1-63 allocated sequence for each * stream. AMD ASIC stream slot allocation should follow the same * sequence. copy DRM MST allocation to dc */ - fill_dc_mst_payload_table_from_drm(stream->link, enable, payload, proposed_table); + fill_dc_mst_payload_table_from_drm(stream->link, enable, target_payload, proposed_table); return true; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c index 54ed3de869d3..9ffba4c6fe55 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c @@ -1697,6 +1697,23 @@ static void dcn314_get_panel_config_defaults(struct dc_panel_config *panel_confi *panel_config = panel_config_defaults; } +static bool filter_modes_for_single_channel_workaround(struct dc *dc, + struct dc_state *context) +{ + // Filter 2K@240Hz+8K@24fps above combination timing if memory only has single dimm LPDDR + if (dc->clk_mgr->bw_params->vram_type == 34 && dc->clk_mgr->bw_params->num_channels < 2) { + int total_phy_pix_clk = 0; + + for (int i = 0; i < context->stream_count; i++) + if (context->res_ctx.pipe_ctx[i].stream) + total_phy_pix_clk += context->res_ctx.pipe_ctx[i].stream->phy_pix_clk; + + if (total_phy_pix_clk >= (1148928+826260)) //2K@240Hz+8K@24fps + return true; + } + return false; +} + bool dcn314_validate_bandwidth(struct dc *dc, struct dc_state *context, bool fast_validate) @@ -1712,6 +1729,9 @@ bool dcn314_validate_bandwidth(struct dc *dc, BW_VAL_TRACE_COUNT(); + if (filter_modes_for_single_channel_workaround(dc, context)) + goto validate_fail; + DC_FP_START(); // do not support self refresh only out = dcn30_internal_validate_bw(dc, context, pipes, &pipe_cnt, &vlevel, fast_validate, false); diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c index b37d14369a62..59836570603a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c @@ -222,7 +222,7 @@ struct _vcs_dpi_ip_params_st dcn3_15_ip = { .maximum_dsc_bits_per_component = 10, .dsc422_native_support = false, .is_line_buffer_bpp_fixed = true, - .line_buffer_fixed_bpp = 49, + .line_buffer_fixed_bpp = 48, .line_buffer_size_bits = 789504, .max_line_buffer_lines = 12, .writeback_interface_buffer_size_kbytes = 90, diff --git a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c index e39b133d05af..b56f07f99d09 100644 --- a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c +++ b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c @@ -934,6 +934,10 @@ bool psr_su_set_dsc_slice_height(struct dc *dc, struct dc_link *link, pic_height = stream->timing.v_addressable + stream->timing.v_border_top + stream->timing.v_border_bottom; + + if (stream->timing.dsc_cfg.num_slices_v == 0) + return false; + slice_height = pic_height / stream->timing.dsc_cfg.num_slices_v; config->dsc_slice_height = slice_height; diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index f085cb97a620..85a090b9e3d9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -61,6 +61,12 @@ #define CTF_OFFSET_HOTSPOT 5 #define CTF_OFFSET_MEM 5 +static const int pmfw_decoded_link_speed[5] = {1, 2, 3, 4, 5}; +static const int pmfw_decoded_link_width[7] = {0, 1, 2, 4, 8, 12, 16}; + +#define DECODE_GEN_SPEED(gen_speed_idx) (pmfw_decoded_link_speed[gen_speed_idx]) +#define DECODE_LANE_WIDTH(lane_width_idx) (pmfw_decoded_link_width[lane_width_idx]) + struct smu_13_0_max_sustainable_clocks { uint32_t display_clock; uint32_t phy_clock; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 27448ffe60a4..a5c97d61e92a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -1144,8 +1144,8 @@ static int smu_v13_0_0_print_clk_levels(struct smu_context *smu, (pcie_table->pcie_lane[i] == 5) ? "x12" : (pcie_table->pcie_lane[i] == 6) ? "x16" : "", pcie_table->clk_freq[i], - ((gen_speed - 1) == pcie_table->pcie_gen[i]) && - (lane_width == link_width[pcie_table->pcie_lane[i]]) ? + (gen_speed == DECODE_GEN_SPEED(pcie_table->pcie_gen[i])) && + (lane_width == DECODE_LANE_WIDTH(link_width[pcie_table->pcie_lane[i]])) ? "*" : ""); break; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index 9e1967d8049e..4399416dd9b8 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -575,6 +575,14 @@ static int smu_v13_0_7_set_default_dpm_table(struct smu_context *smu) dpm_table); if (ret) return ret; + + if (skutable->DriverReportedClocks.GameClockAc && + (dpm_table->dpm_levels[dpm_table->count - 1].value > + skutable->DriverReportedClocks.GameClockAc)) { + dpm_table->dpm_levels[dpm_table->count - 1].value = + skutable->DriverReportedClocks.GameClockAc; + dpm_table->max = skutable->DriverReportedClocks.GameClockAc; + } } else { dpm_table->count = 1; dpm_table->dpm_levels[0].value = smu->smu_table.boot_values.gfxclk / 100; @@ -828,6 +836,57 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu, return ret; } +static int smu_v13_0_7_get_dpm_ultimate_freq(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *min, + uint32_t *max) +{ + struct smu_13_0_dpm_context *dpm_context = + smu->smu_dpm.dpm_context; + struct smu_13_0_dpm_table *dpm_table; + + switch (clk_type) { + case SMU_MCLK: + case SMU_UCLK: + /* uclk dpm table */ + dpm_table = &dpm_context->dpm_tables.uclk_table; + break; + case SMU_GFXCLK: + case SMU_SCLK: + /* gfxclk dpm table */ + dpm_table = &dpm_context->dpm_tables.gfx_table; + break; + case SMU_SOCCLK: + /* socclk dpm table */ + dpm_table = &dpm_context->dpm_tables.soc_table; + break; + case SMU_FCLK: + /* fclk dpm table */ + dpm_table = &dpm_context->dpm_tables.fclk_table; + break; + case SMU_VCLK: + case SMU_VCLK1: + /* vclk dpm table */ + dpm_table = &dpm_context->dpm_tables.vclk_table; + break; + case SMU_DCLK: + case SMU_DCLK1: + /* dclk dpm table */ + dpm_table = &dpm_context->dpm_tables.dclk_table; + break; + default: + dev_err(smu->adev->dev, "Unsupported clock type!\n"); + return -EINVAL; + } + + if (min) + *min = dpm_table->min; + if (max) + *max = dpm_table->max; + + return 0; +} + static int smu_v13_0_7_read_sensor(struct smu_context *smu, enum amd_pp_sensors sensor, void *data, @@ -1074,8 +1133,8 @@ static int smu_v13_0_7_print_clk_levels(struct smu_context *smu, (pcie_table->pcie_lane[i] == 5) ? "x12" : (pcie_table->pcie_lane[i] == 6) ? "x16" : "", pcie_table->clk_freq[i], - (gen_speed == pcie_table->pcie_gen[i]) && - (lane_width == pcie_table->pcie_lane[i]) ? + (gen_speed == DECODE_GEN_SPEED(pcie_table->pcie_gen[i])) && + (lane_width == DECODE_LANE_WIDTH(pcie_table->pcie_lane[i])) ? "*" : ""); break; @@ -1329,9 +1388,17 @@ static int smu_v13_0_7_populate_umd_state_clk(struct smu_context *smu) &dpm_context->dpm_tables.fclk_table; struct smu_umd_pstate_table *pstate_table = &smu->pstate_table; + struct smu_table_context *table_context = &smu->smu_table; + PPTable_t *pptable = table_context->driver_pptable; + DriverReportedClocks_t driver_clocks = + pptable->SkuTable.DriverReportedClocks; pstate_table->gfxclk_pstate.min = gfx_table->min; - pstate_table->gfxclk_pstate.peak = gfx_table->max; + if (driver_clocks.GameClockAc && + (driver_clocks.GameClockAc < gfx_table->max)) + pstate_table->gfxclk_pstate.peak = driver_clocks.GameClockAc; + else + pstate_table->gfxclk_pstate.peak = gfx_table->max; pstate_table->uclk_pstate.min = mem_table->min; pstate_table->uclk_pstate.peak = mem_table->max; @@ -1348,12 +1415,12 @@ static int smu_v13_0_7_populate_umd_state_clk(struct smu_context *smu) pstate_table->fclk_pstate.min = fclk_table->min; pstate_table->fclk_pstate.peak = fclk_table->max; - /* - * For now, just use the mininum clock frequency. - * TODO: update them when the real pstate settings available - */ - pstate_table->gfxclk_pstate.standard = gfx_table->min; - pstate_table->uclk_pstate.standard = mem_table->min; + if (driver_clocks.BaseClockAc && + driver_clocks.BaseClockAc < gfx_table->max) + pstate_table->gfxclk_pstate.standard = driver_clocks.BaseClockAc; + else + pstate_table->gfxclk_pstate.standard = gfx_table->max; + pstate_table->uclk_pstate.standard = mem_table->max; pstate_table->socclk_pstate.standard = soc_table->min; pstate_table->vclk_pstate.standard = vclk_table->min; pstate_table->dclk_pstate.standard = dclk_table->min; @@ -1676,7 +1743,7 @@ static const struct pptable_funcs smu_v13_0_7_ppt_funcs = { .dpm_set_jpeg_enable = smu_v13_0_set_jpeg_enable, .init_pptable_microcode = smu_v13_0_init_pptable_microcode, .populate_umd_state_clk = smu_v13_0_7_populate_umd_state_clk, - .get_dpm_ultimate_freq = smu_v13_0_get_dpm_ultimate_freq, + .get_dpm_ultimate_freq = smu_v13_0_7_get_dpm_ultimate_freq, .get_vbios_bootup_values = smu_v13_0_get_vbios_bootup_values, .read_sensor = smu_v13_0_7_read_sensor, .feature_is_enabled = smu_cmn_feature_is_enabled, diff --git a/drivers/gpu/drm/armada/armada_drv.c b/drivers/gpu/drm/armada/armada_drv.c index 0643887800b4..142668cd6d7c 100644 --- a/drivers/gpu/drm/armada/armada_drv.c +++ b/drivers/gpu/drm/armada/armada_drv.c @@ -99,7 +99,6 @@ static int armada_drm_bind(struct device *dev) if (ret) { dev_err(dev, "[" DRM_NAME ":%s] can't kick out simple-fb: %d\n", __func__, ret); - kfree(priv); return ret; } diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index 468a792e6a40..fc0eaf40dc94 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -300,9 +300,21 @@ static void configure_dual_link_mode(struct intel_encoder *encoder, { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); struct intel_dsi *intel_dsi = enc_to_intel_dsi(encoder); + i915_reg_t dss_ctl1_reg, dss_ctl2_reg; u32 dss_ctl1; - dss_ctl1 = intel_de_read(dev_priv, DSS_CTL1); + /* FIXME: Move all DSS handling to intel_vdsc.c */ + if (DISPLAY_VER(dev_priv) >= 12) { + struct intel_crtc *crtc = to_intel_crtc(pipe_config->uapi.crtc); + + dss_ctl1_reg = ICL_PIPE_DSS_CTL1(crtc->pipe); + dss_ctl2_reg = ICL_PIPE_DSS_CTL2(crtc->pipe); + } else { + dss_ctl1_reg = DSS_CTL1; + dss_ctl2_reg = DSS_CTL2; + } + + dss_ctl1 = intel_de_read(dev_priv, dss_ctl1_reg); dss_ctl1 |= SPLITTER_ENABLE; dss_ctl1 &= ~OVERLAP_PIXELS_MASK; dss_ctl1 |= OVERLAP_PIXELS(intel_dsi->pixel_overlap); @@ -323,16 +335,16 @@ static void configure_dual_link_mode(struct intel_encoder *encoder, dss_ctl1 &= ~LEFT_DL_BUF_TARGET_DEPTH_MASK; dss_ctl1 |= LEFT_DL_BUF_TARGET_DEPTH(dl_buffer_depth); - dss_ctl2 = intel_de_read(dev_priv, DSS_CTL2); + dss_ctl2 = intel_de_read(dev_priv, dss_ctl2_reg); dss_ctl2 &= ~RIGHT_DL_BUF_TARGET_DEPTH_MASK; dss_ctl2 |= RIGHT_DL_BUF_TARGET_DEPTH(dl_buffer_depth); - intel_de_write(dev_priv, DSS_CTL2, dss_ctl2); + intel_de_write(dev_priv, dss_ctl2_reg, dss_ctl2); } else { /* Interleave */ dss_ctl1 |= DUAL_LINK_MODE_INTERLEAVE; } - intel_de_write(dev_priv, DSS_CTL1, dss_ctl1); + intel_de_write(dev_priv, dss_ctl1_reg, dss_ctl1); } /* aka DSI 8X clock */ diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux.c b/drivers/gpu/drm/i915/display/intel_dp_aux.c index 5a176bfb10a2..30c98810e28b 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_aux.c +++ b/drivers/gpu/drm/i915/display/intel_dp_aux.c @@ -163,7 +163,7 @@ static u32 skl_get_aux_send_ctl(struct intel_dp *intel_dp, DP_AUX_CH_CTL_TIME_OUT_MAX | DP_AUX_CH_CTL_RECEIVE_ERROR | (send_bytes << DP_AUX_CH_CTL_MESSAGE_SIZE_SHIFT) | - DP_AUX_CH_CTL_FW_SYNC_PULSE_SKL(32) | + DP_AUX_CH_CTL_FW_SYNC_PULSE_SKL(24) | DP_AUX_CH_CTL_SYNC_PULSE_SKL(32); if (intel_tc_port_in_tbt_alt_mode(dig_port)) diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index f77e44958037..ab9062e50977 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -645,7 +645,7 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, struct drm_nouveau_gem_pushbuf_reloc *reloc, struct drm_nouveau_gem_pushbuf_bo *bo) { - long ret = 0; + int ret = 0; unsigned i; for (i = 0; i < req->nr_relocs; i++) { @@ -653,6 +653,7 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, struct drm_nouveau_gem_pushbuf_bo *b; struct nouveau_bo *nvbo; uint32_t data; + long lret; if (unlikely(r->bo_index >= req->nr_buffers)) { NV_PRINTK(err, cli, "reloc bo index invalid\n"); @@ -703,13 +704,18 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, data |= r->vor; } - ret = dma_resv_wait_timeout(nvbo->bo.base.resv, - DMA_RESV_USAGE_BOOKKEEP, - false, 15 * HZ); - if (ret == 0) + lret = dma_resv_wait_timeout(nvbo->bo.base.resv, + DMA_RESV_USAGE_BOOKKEEP, + false, 15 * HZ); + if (!lret) ret = -EBUSY; + else if (lret > 0) + ret = 0; + else + ret = lret; + if (ret) { - NV_PRINTK(err, cli, "reloc wait_idle failed: %ld\n", + NV_PRINTK(err, cli, "reloc wait_idle failed: %d\n", ret); break; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c index 76678dd60f93..c4c6f67af7cc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c @@ -31,6 +31,7 @@ gf108_fb = { .init = gf100_fb_init, .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, + .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, .ram_new = gf108_ram_new, .default_bigpage = 17, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c index f73442ccb424..433fa966ba23 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c @@ -77,6 +77,7 @@ gk104_fb = { .init = gf100_fb_init, .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, + .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, .ram_new = gk104_ram_new, .default_bigpage = 17, .clkgate_pack = gk104_fb_clkgate_pack, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk110.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk110.c index 45d6cdffafee..4dc283dedf8b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk110.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk110.c @@ -59,6 +59,7 @@ gk110_fb = { .init = gf100_fb_init, .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, + .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, .ram_new = gk104_ram_new, .default_bigpage = 17, .clkgate_pack = gk110_fb_clkgate_pack, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c index de52462a92bf..90bfff616d35 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c @@ -31,6 +31,7 @@ gm107_fb = { .init = gf100_fb_init, .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, + .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, .ram_new = gm107_ram_new, .default_bigpage = 17, }; diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c index ba3b81789509..293c228a83f9 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c @@ -839,6 +839,8 @@ static void vop2_enable(struct vop2 *vop2) return; } + regcache_sync(vop2->map); + if (vop2->data->soc_id == 3566) vop2_writel(vop2, RK3568_OTP_WIN_EN, 1); @@ -867,6 +869,8 @@ static void vop2_disable(struct vop2 *vop2) pm_runtime_put_sync(vop2->dev); + regcache_mark_dirty(vop2->map); + clk_disable_unprepare(vop2->aclk); clk_disable_unprepare(vop2->hclk); } diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 15d04a0ec623..e0a8890a62e2 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -507,12 +507,19 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) { struct drm_sched_entity *entity = sched_job->entity; bool first; + ktime_t submit_ts; trace_drm_sched_job(sched_job, entity); atomic_inc(entity->rq->sched->score); WRITE_ONCE(entity->last_user, current->group_leader); + + /* + * After the sched_job is pushed into the entity queue, it may be + * completed and freed up at any time. We can no longer access it. + * Make sure to set the submit_ts first, to avoid a race. + */ + sched_job->submit_ts = submit_ts = ktime_get(); first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node); - sched_job->submit_ts = ktime_get(); /* first job wakes up scheduler */ if (first) { @@ -529,7 +536,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) spin_unlock(&entity->rq_lock); if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) - drm_sched_rq_update_fifo(entity, sched_job->submit_ts); + drm_sched_rq_update_fifo(entity, submit_ts); drm_sched_wakeup(entity->rq->sched); } diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 0e4378420271..1e08cc5a1702 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -308,7 +308,8 @@ static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched) */ void drm_sched_fault(struct drm_gpu_scheduler *sched) { - mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0); + if (sched->ready) + mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0); } EXPORT_SYMBOL(drm_sched_fault); diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 82f64fb31fda..4ce012f83253 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -1122,7 +1122,7 @@ config HID_TOPRE tristate "Topre REALFORCE keyboards" depends on HID help - Say Y for N-key rollover support on Topre REALFORCE R2 108 key keyboards. + Say Y for N-key rollover support on Topre REALFORCE R2 108/87 key keyboards. config HID_THINGM tristate "ThingM blink(1) USB RGB LED" diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 63545cd307e5..c2e9b6d1fd7d 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -420,6 +420,9 @@ #define I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN 0x261A #define I2C_DEVICE_ID_SURFACE_GO2_TOUCHSCREEN 0x2A1C #define I2C_DEVICE_ID_LENOVO_YOGA_C630_TOUCHSCREEN 0x279F +#define I2C_DEVICE_ID_HP_SPECTRE_X360_13T_AW100 0x29F5 +#define I2C_DEVICE_ID_HP_SPECTRE_X360_14T_EA100_V1 0x2BED +#define I2C_DEVICE_ID_HP_SPECTRE_X360_14T_EA100_V2 0x2BEE #define USB_VENDOR_ID_ELECOM 0x056e #define USB_DEVICE_ID_ELECOM_BM084 0x0061 @@ -1249,6 +1252,7 @@ #define USB_VENDOR_ID_TOPRE 0x0853 #define USB_DEVICE_ID_TOPRE_REALFORCE_R2_108 0x0148 +#define USB_DEVICE_ID_TOPRE_REALFORCE_R2_87 0x0146 #define USB_VENDOR_ID_TOPSEED 0x0766 #define USB_DEVICE_ID_TOPSEED_CYBERLINK 0x0204 diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 7fc967964dd8..5c65a584b3fa 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -398,6 +398,12 @@ static const struct hid_device_id hid_battery_quirks[] = { HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_LENOVO_YOGA_C630_TOUCHSCREEN), HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_13T_AW100), + HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_14T_EA100_V1), + HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_14T_EA100_V2), + HID_BATTERY_QUIRK_IGNORE }, {} }; diff --git a/drivers/hid/hid-sensor-custom.c b/drivers/hid/hid-sensor-custom.c index 3e3f89e01d81..d85398721659 100644 --- a/drivers/hid/hid-sensor-custom.c +++ b/drivers/hid/hid-sensor-custom.c @@ -940,7 +940,7 @@ hid_sensor_register_platform_device(struct platform_device *pdev, struct hid_sensor_hub_device *hsdev, const struct hid_sensor_custom_match *match) { - char real_usage[HID_SENSOR_USAGE_LENGTH]; + char real_usage[HID_SENSOR_USAGE_LENGTH] = { 0 }; struct platform_device *custom_pdev; const char *dev_name; char *c; diff --git a/drivers/hid/hid-topre.c b/drivers/hid/hid-topre.c index 88a91cdad5f8..d1d5ca310ead 100644 --- a/drivers/hid/hid-topre.c +++ b/drivers/hid/hid-topre.c @@ -36,6 +36,8 @@ static __u8 *topre_report_fixup(struct hid_device *hdev, __u8 *rdesc, static const struct hid_device_id topre_id_table[] = { { HID_USB_DEVICE(USB_VENDOR_ID_TOPRE, USB_DEVICE_ID_TOPRE_REALFORCE_R2_108) }, + { HID_USB_DEVICE(USB_VENDOR_ID_TOPRE, + USB_DEVICE_ID_TOPRE_REALFORCE_R2_87) }, { } }; MODULE_DEVICE_TABLE(hid, topre_id_table); diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c index 81385ab37fa9..7fc738a22375 100644 --- a/drivers/hid/intel-ish-hid/ishtp/bus.c +++ b/drivers/hid/intel-ish-hid/ishtp/bus.c @@ -241,8 +241,8 @@ static int ishtp_cl_bus_match(struct device *dev, struct device_driver *drv) struct ishtp_cl_device *device = to_ishtp_cl_device(dev); struct ishtp_cl_driver *driver = to_ishtp_cl_driver(drv); - return guid_equal(&driver->id[0].guid, - &device->fw_client->props.protocol_name); + return(device->fw_client ? guid_equal(&driver->id[0].guid, + &device->fw_client->props.protocol_name) : 0); } /** diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index c6692fd5ab15..2111e97c3b63 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -211,7 +211,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer = (struct hv_ring_buffer *) vmap_pfn(pfns_wraparound, page_cnt * 2 - 1, - PAGE_KERNEL); + pgprot_decrypted(PAGE_KERNEL)); kfree(pfns_wraparound); if (!ring_info->ring_buffer) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index d24dd65b33d4..e9e1c4139e0d 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2156,7 +2156,6 @@ void vmbus_device_unregister(struct hv_device *device_obj) * VMBUS is an acpi enumerated device. Get the information we * need from DSDT. */ -#define VTPM_BASE_ADDRESS 0xfed40000 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) { resource_size_t start = 0; diff --git a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c index 09af75921147..b21ffd6df927 100644 --- a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c +++ b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c @@ -48,9 +48,9 @@ * SR_HOLD_TIME_XK_TICKS field will indicate the number of ticks of the * baud clock required to program 'Hold Time' at X KHz. */ -#define SR_HOLD_TIME_100K_TICKS 133 -#define SR_HOLD_TIME_400K_TICKS 20 -#define SR_HOLD_TIME_1000K_TICKS 11 +#define SR_HOLD_TIME_100K_TICKS 150 +#define SR_HOLD_TIME_400K_TICKS 20 +#define SR_HOLD_TIME_1000K_TICKS 12 #define SMB_CORE_COMPLETION_REG_OFF3 (SMBUS_MAST_CORE_ADDR_BASE + 0x23) @@ -65,17 +65,17 @@ * the baud clock required to program 'fair idle delay' at X KHz. Fair idle * delay establishes the MCTP T(IDLE_DELAY) period. */ -#define FAIR_BUS_IDLE_MIN_100K_TICKS 969 -#define FAIR_BUS_IDLE_MIN_400K_TICKS 157 -#define FAIR_BUS_IDLE_MIN_1000K_TICKS 157 +#define FAIR_BUS_IDLE_MIN_100K_TICKS 992 +#define FAIR_BUS_IDLE_MIN_400K_TICKS 500 +#define FAIR_BUS_IDLE_MIN_1000K_TICKS 500 /* * FAIR_IDLE_DELAY_XK_TICKS field will indicate the number of ticks of the * baud clock required to satisfy the fairness protocol at X KHz. */ -#define FAIR_IDLE_DELAY_100K_TICKS 1000 -#define FAIR_IDLE_DELAY_400K_TICKS 500 -#define FAIR_IDLE_DELAY_1000K_TICKS 500 +#define FAIR_IDLE_DELAY_100K_TICKS 963 +#define FAIR_IDLE_DELAY_400K_TICKS 156 +#define FAIR_IDLE_DELAY_1000K_TICKS 156 #define SMB_IDLE_SCALING_100K \ ((FAIR_IDLE_DELAY_100K_TICKS << 16) | FAIR_BUS_IDLE_MIN_100K_TICKS) @@ -105,7 +105,7 @@ */ #define BUS_CLK_100K_LOW_PERIOD_TICKS 156 #define BUS_CLK_400K_LOW_PERIOD_TICKS 41 -#define BUS_CLK_1000K_LOW_PERIOD_TICKS 15 +#define BUS_CLK_1000K_LOW_PERIOD_TICKS 15 /* * BUS_CLK_XK_HIGH_PERIOD_TICKS field defines the number of I2C Baud Clock @@ -131,7 +131,7 @@ */ #define CLK_SYNC_100K 4 #define CLK_SYNC_400K 4 -#define CLK_SYNC_1000K 4 +#define CLK_SYNC_1000K 4 #define SMB_CORE_DATA_TIMING_REG_OFF (SMBUS_MAST_CORE_ADDR_BASE + 0x40) @@ -142,25 +142,25 @@ * determines the SCLK hold time following SDAT driven low during the first * START bit in a transfer. */ -#define FIRST_START_HOLD_100K_TICKS 22 -#define FIRST_START_HOLD_400K_TICKS 16 -#define FIRST_START_HOLD_1000K_TICKS 6 +#define FIRST_START_HOLD_100K_TICKS 23 +#define FIRST_START_HOLD_400K_TICKS 8 +#define FIRST_START_HOLD_1000K_TICKS 12 /* * STOP_SETUP_XK_TICKS will indicate the number of ticks of the baud clock * required to program 'STOP_SETUP' timer at X KHz. This timer determines the * SDAT setup time from the rising edge of SCLK for a STOP condition. */ -#define STOP_SETUP_100K_TICKS 157 +#define STOP_SETUP_100K_TICKS 150 #define STOP_SETUP_400K_TICKS 20 -#define STOP_SETUP_1000K_TICKS 12 +#define STOP_SETUP_1000K_TICKS 12 /* * RESTART_SETUP_XK_TICKS will indicate the number of ticks of the baud clock * required to program 'RESTART_SETUP' timer at X KHz. This timer determines the * SDAT setup time from the rising edge of SCLK for a repeated START condition. */ -#define RESTART_SETUP_100K_TICKS 157 +#define RESTART_SETUP_100K_TICKS 156 #define RESTART_SETUP_400K_TICKS 20 #define RESTART_SETUP_1000K_TICKS 12 @@ -169,7 +169,7 @@ * required to program 'DATA_HOLD' timer at X KHz. This timer determines the * SDAT hold time following SCLK driven low. */ -#define DATA_HOLD_100K_TICKS 2 +#define DATA_HOLD_100K_TICKS 12 #define DATA_HOLD_400K_TICKS 2 #define DATA_HOLD_1000K_TICKS 2 @@ -190,35 +190,35 @@ * Bus Idle Minimum time = BUS_IDLE_MIN[7:0] x Baud_Clock_Period x * (BUS_IDLE_MIN_XK_TICKS[7] ? 4,1) */ -#define BUS_IDLE_MIN_100K_TICKS 167UL -#define BUS_IDLE_MIN_400K_TICKS 139UL -#define BUS_IDLE_MIN_1000K_TICKS 133UL +#define BUS_IDLE_MIN_100K_TICKS 36UL +#define BUS_IDLE_MIN_400K_TICKS 10UL +#define BUS_IDLE_MIN_1000K_TICKS 4UL /* * CTRL_CUM_TIME_OUT_XK_TICKS defines SMBus Controller Cumulative Time-Out. * SMBus Controller Cumulative Time-Out duration = * CTRL_CUM_TIME_OUT_XK_TICKS[7:0] x Baud_Clock_Period x 2048 */ -#define CTRL_CUM_TIME_OUT_100K_TICKS 159 -#define CTRL_CUM_TIME_OUT_400K_TICKS 159 -#define CTRL_CUM_TIME_OUT_1000K_TICKS 159 +#define CTRL_CUM_TIME_OUT_100K_TICKS 76 +#define CTRL_CUM_TIME_OUT_400K_TICKS 76 +#define CTRL_CUM_TIME_OUT_1000K_TICKS 76 /* * TARGET_CUM_TIME_OUT_XK_TICKS defines SMBus Target Cumulative Time-Out duration. * SMBus Target Cumulative Time-Out duration = TARGET_CUM_TIME_OUT_XK_TICKS[7:0] x * Baud_Clock_Period x 4096 */ -#define TARGET_CUM_TIME_OUT_100K_TICKS 199 -#define TARGET_CUM_TIME_OUT_400K_TICKS 199 -#define TARGET_CUM_TIME_OUT_1000K_TICKS 199 +#define TARGET_CUM_TIME_OUT_100K_TICKS 95 +#define TARGET_CUM_TIME_OUT_400K_TICKS 95 +#define TARGET_CUM_TIME_OUT_1000K_TICKS 95 /* * CLOCK_HIGH_TIME_OUT_XK defines Clock High time out period. * Clock High time out period = CLOCK_HIGH_TIME_OUT_XK[7:0] x Baud_Clock_Period x 8 */ -#define CLOCK_HIGH_TIME_OUT_100K_TICKS 204 -#define CLOCK_HIGH_TIME_OUT_400K_TICKS 204 -#define CLOCK_HIGH_TIME_OUT_1000K_TICKS 204 +#define CLOCK_HIGH_TIME_OUT_100K_TICKS 97 +#define CLOCK_HIGH_TIME_OUT_400K_TICKS 97 +#define CLOCK_HIGH_TIME_OUT_1000K_TICKS 97 #define TO_SCALING_100K \ ((BUS_IDLE_MIN_100K_TICKS << 24) | (CTRL_CUM_TIME_OUT_100K_TICKS << 16) | \ diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index a0af027db04c..2e575856c5cd 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -342,18 +342,18 @@ static int ocores_poll_wait(struct ocores_i2c *i2c) * ocores_isr(), we just add our polling code around it. * * It can run in atomic context + * + * Return: 0 on success, -ETIMEDOUT on timeout */ -static void ocores_process_polling(struct ocores_i2c *i2c) +static int ocores_process_polling(struct ocores_i2c *i2c) { - while (1) { - irqreturn_t ret; - int err; + irqreturn_t ret; + int err = 0; + while (1) { err = ocores_poll_wait(i2c); - if (err) { - i2c->state = STATE_ERROR; + if (err) break; /* timeout */ - } ret = ocores_isr(-1, i2c); if (ret == IRQ_NONE) @@ -364,13 +364,15 @@ static void ocores_process_polling(struct ocores_i2c *i2c) break; } } + + return err; } static int ocores_xfer_core(struct ocores_i2c *i2c, struct i2c_msg *msgs, int num, bool polling) { - int ret; + int ret = 0; u8 ctrl; ctrl = oc_getreg(i2c, OCI2C_CONTROL); @@ -388,15 +390,16 @@ static int ocores_xfer_core(struct ocores_i2c *i2c, oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_START); if (polling) { - ocores_process_polling(i2c); + ret = ocores_process_polling(i2c); } else { - ret = wait_event_timeout(i2c->wait, - (i2c->state == STATE_ERROR) || - (i2c->state == STATE_DONE), HZ); - if (ret == 0) { - ocores_process_timeout(i2c); - return -ETIMEDOUT; - } + if (wait_event_timeout(i2c->wait, + (i2c->state == STATE_ERROR) || + (i2c->state == STATE_DONE), HZ) == 0) + ret = -ETIMEDOUT; + } + if (ret) { + ocores_process_timeout(i2c); + return ret; } return (i2c->state == STATE_DONE) ? num : -EIO; diff --git a/drivers/i2c/i2c-core-of.c b/drivers/i2c/i2c-core-of.c index bce6b796e04c..545436b7dd53 100644 --- a/drivers/i2c/i2c-core-of.c +++ b/drivers/i2c/i2c-core-of.c @@ -178,6 +178,11 @@ static int of_i2c_notify(struct notifier_block *nb, unsigned long action, return NOTIFY_OK; } + /* + * Clear the flag before adding the device so that fw_devlink + * doesn't skip adding consumers to this device. + */ + rd->dn->fwnode.flags &= ~FWNODE_FLAG_NOT_DEVICE; client = of_i2c_register_device(adap, rd->dn); if (IS_ERR(client)) { dev_err(&adap->dev, "failed to create client for '%pOF'\n", diff --git a/drivers/iio/adc/at91-sama5d2_adc.c b/drivers/iio/adc/at91-sama5d2_adc.c index 50d02e5fc6fc..7258912fe17b 100644 --- a/drivers/iio/adc/at91-sama5d2_adc.c +++ b/drivers/iio/adc/at91-sama5d2_adc.c @@ -1409,7 +1409,7 @@ static struct iio_trigger *at91_adc_allocate_trigger(struct iio_dev *indio, trig = devm_iio_trigger_alloc(&indio->dev, "%s-dev%d-%s", indio->name, iio_device_id(indio), trigger_name); if (!trig) - return NULL; + return ERR_PTR(-ENOMEM); trig->dev.parent = indio->dev.parent; iio_trigger_set_drvdata(trig, indio); diff --git a/drivers/iio/dac/ad5755.c b/drivers/iio/dac/ad5755.c index beadfa938d2d..404865e35460 100644 --- a/drivers/iio/dac/ad5755.c +++ b/drivers/iio/dac/ad5755.c @@ -802,6 +802,7 @@ static struct ad5755_platform_data *ad5755_parse_fw(struct device *dev) return pdata; error_out: + fwnode_handle_put(pp); devm_kfree(dev, pdata); return NULL; } diff --git a/drivers/iio/light/tsl2772.c b/drivers/iio/light/tsl2772.c index ad50baa0202c..e823c145f679 100644 --- a/drivers/iio/light/tsl2772.c +++ b/drivers/iio/light/tsl2772.c @@ -601,6 +601,7 @@ static int tsl2772_read_prox_diodes(struct tsl2772_chip *chip) return -EINVAL; } } + chip->settings.prox_diode = prox_diode_mask; return 0; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 308155937713..6b9563d4f23c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -624,22 +624,11 @@ static inline unsigned short cma_family(struct rdma_id_private *id_priv) return id_priv->id.route.addr.src_addr.ss_family; } -static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) { - if (qkey && id_priv->qkey != qkey) - return -EINVAL; - return 0; - } - - if (qkey) { - id_priv->qkey = qkey; - return 0; - } - switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: @@ -659,6 +648,16 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) return ret; } +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ + if (!qkey || + (id_priv->qkey && (id_priv->qkey != qkey))) + return -EINVAL; + + id_priv->qkey = qkey; + return 0; +} + static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; @@ -1229,7 +1228,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv, 0); + ret = cma_set_default_qkey(id_priv); if (ret) return ret; @@ -4569,7 +4568,10 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv, qkey); + if (qkey) + ret = cma_set_qkey(id_priv, qkey); + else + ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; @@ -4774,9 +4776,7 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, enum ib_gid_type gid_type; struct net_device *ndev; - if (!status) - status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); - else + if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); @@ -4804,7 +4804,7 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, } event->param.ud.qp_num = 0xFFFFFF; - event->param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + event->param.ud.qkey = id_priv->qkey; out: if (ndev) @@ -4823,8 +4823,11 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; - cma_make_mc_event(status, id_priv, multicast, &event, mc); - ret = cma_cm_event_handler(id_priv, &event); + ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + if (!ret) { + cma_make_mc_event(status, id_priv, multicast, &event, mc); + ret = cma_cm_event_handler(id_priv, &event); + } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); @@ -4877,9 +4880,11 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, if (ret) return ret; - ret = cma_set_qkey(id_priv, 0); - if (ret) - return ret; + if (!id_priv->qkey) { + ret = cma_set_default_qkey(id_priv); + if (ret) + return ret; + } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); @@ -4956,9 +4961,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); - if (id_priv->id.ps == RDMA_PS_UDP) - ib.rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); - if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) @@ -4984,6 +4986,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (err || !ib.rec.mtu) return err ?: -EINVAL; + if (!id_priv->qkey) + cma_set_default_qkey(id_priv); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); @@ -5009,6 +5014,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; + if (id_priv->id.qp_type != IB_QPT_UD) + return -EINVAL; + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 11b1c1603aeb..b99b3cc283b6 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -532,6 +532,8 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, else ret = device->ops.create_ah(ah, &init_attr, NULL); if (ret) { + if (ah->sgid_attr) + rdma_put_gid_attr(ah->sgid_attr); kfree(ah); return ERR_PTR(ret); } diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index cabd8678b355..7bc354273d4e 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -65,7 +65,7 @@ static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, [ERDMA_OP_ATOMIC_CAS] = IB_WC_COMP_SWAP, - [ERDMA_OP_ATOMIC_FAD] = IB_WC_FETCH_ADD, + [ERDMA_OP_ATOMIC_FAA] = IB_WC_FETCH_ADD, }; static const struct { diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 4c38d99c73f1..37ad1bb1917c 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -441,7 +441,7 @@ struct erdma_reg_mr_sqe { }; /* EQ related. */ -#define ERDMA_DEFAULT_EQ_DEPTH 256 +#define ERDMA_DEFAULT_EQ_DEPTH 4096 /* ceqe */ #define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63) @@ -491,7 +491,7 @@ enum erdma_opcode { ERDMA_OP_LOCAL_INV = 15, ERDMA_OP_READ_WITH_INV = 16, ERDMA_OP_ATOMIC_CAS = 17, - ERDMA_OP_ATOMIC_FAD = 18, + ERDMA_OP_ATOMIC_FAA = 18, ERDMA_NUM_OPCODES = 19, ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 }; diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 5dc31e5df5cb..4a29a53a6652 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -56,7 +56,7 @@ done: static int erdma_enum_and_get_netdev(struct erdma_dev *dev) { struct net_device *netdev; - int ret = -ENODEV; + int ret = -EPROBE_DEFER; /* Already binded to a net_device, so we skip. */ if (dev->netdev) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index d088d6bef431..44923c51a01b 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -405,7 +405,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, mr->mem.mtt_nents); - if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) { + if (mr->mem.mtt_nents <= ERDMA_MAX_INLINE_MTT_ENTRIES) { attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0); /* Copy SGLs to SQE content to accelerate */ memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1, @@ -439,7 +439,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, cpu_to_le64(atomic_wr(send_wr)->compare_add); } else { wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, - ERDMA_OP_ATOMIC_FAD); + ERDMA_OP_ATOMIC_FAA); atomic_sqe->fetchadd_swap_data = cpu_to_le64(atomic_wr(send_wr)->compare_add); } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index e0a993bc032a..131cf5f40982 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -11,7 +11,7 @@ /* RDMA Capability. */ #define ERDMA_MAX_PD (128 * 1024) -#define ERDMA_MAX_SEND_WR 4096 +#define ERDMA_MAX_SEND_WR 8192 #define ERDMA_MAX_ORD 128 #define ERDMA_MAX_IRD 128 #define ERDMA_MAX_SGE_RD 1 diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index b1d6ca7e9708..f3d6ce45c397 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -267,6 +267,8 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) if (!HFI1_CAP_IS_KSET(SDMA)) return -EINVAL; + if (!from->user_backed) + return -EINVAL; idx = srcu_read_lock(&fd->pq_srcu); pq = srcu_dereference(fd->pq, &fd->pq_srcu); if (!cq || !pq) { @@ -274,11 +276,6 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) return -EIO; } - if (!iter_is_iovec(from) || !dim) { - srcu_read_unlock(&fd->pq_srcu, idx); - return -EINVAL; - } - trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim); if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { @@ -287,11 +284,12 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) } while (dim) { + const struct iovec *iov = iter_iov(from); int ret; unsigned long count = 0; ret = hfi1_user_sdma_process_request( - fd, (struct iovec *)(from->iov + done), + fd, (struct iovec *)(iov + done), dim, &count); if (ret) { reqs = ret; diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 195aa9ea18b6..8817864154af 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1458,13 +1458,15 @@ static int irdma_send_fin(struct irdma_cm_node *cm_node) * irdma_find_listener - find a cm node listening on this addr-port pair * @cm_core: cm's core * @dst_addr: listener ip addr + * @ipv4: flag indicating IPv4 when true * @dst_port: listener tcp port num * @vlan_id: virtual LAN ID * @listener_state: state to match with listen node's */ static struct irdma_cm_listener * -irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, - u16 vlan_id, enum irdma_cm_listener_state listener_state) +irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, bool ipv4, + u16 dst_port, u16 vlan_id, + enum irdma_cm_listener_state listener_state) { struct irdma_cm_listener *listen_node; static const u32 ip_zero[4] = { 0, 0, 0, 0 }; @@ -1477,7 +1479,7 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, list_for_each_entry (listen_node, &cm_core->listen_list, list) { memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr)); listen_port = listen_node->loc_port; - if (listen_port != dst_port || + if (listen_node->ipv4 != ipv4 || listen_port != dst_port || !(listener_state & listen_node->listener_state)) continue; /* compare node pair, return node handle if a match */ @@ -2902,9 +2904,10 @@ irdma_make_listen_node(struct irdma_cm_core *cm_core, unsigned long flags; /* cannot have multiple matching listeners */ - listener = irdma_find_listener(cm_core, cm_info->loc_addr, - cm_info->loc_port, cm_info->vlan_id, - IRDMA_CM_LISTENER_EITHER_STATE); + listener = + irdma_find_listener(cm_core, cm_info->loc_addr, cm_info->ipv4, + cm_info->loc_port, cm_info->vlan_id, + IRDMA_CM_LISTENER_EITHER_STATE); if (listener && listener->listener_state == IRDMA_CM_LISTENER_ACTIVE_STATE) { refcount_dec(&listener->refcnt); @@ -3153,6 +3156,7 @@ void irdma_receive_ilq(struct irdma_sc_vsi *vsi, struct irdma_puda_buf *rbuf) listener = irdma_find_listener(cm_core, cm_info.loc_addr, + cm_info.ipv4, cm_info.loc_port, cm_info.vlan_id, IRDMA_CM_LISTENER_ACTIVE_STATE); diff --git a/drivers/infiniband/hw/irdma/cm.h b/drivers/infiniband/hw/irdma/cm.h index 19c284975fc7..7feadb3e1eda 100644 --- a/drivers/infiniband/hw/irdma/cm.h +++ b/drivers/infiniband/hw/irdma/cm.h @@ -41,7 +41,7 @@ #define TCP_OPTIONS_PADDING 3 #define IRDMA_DEFAULT_RETRYS 64 -#define IRDMA_DEFAULT_RETRANS 8 +#define IRDMA_DEFAULT_RETRANS 32 #define IRDMA_DEFAULT_TTL 0x40 #define IRDMA_DEFAULT_RTT_VAR 6 #define IRDMA_DEFAULT_SS_THRESH 0x3fffffff diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 2e1e2bad0401..43dfa4761f06 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -41,6 +41,7 @@ static enum irdma_hmc_rsrc_type iw_hmc_obj_types[] = { IRDMA_HMC_IW_XFFL, IRDMA_HMC_IW_Q1, IRDMA_HMC_IW_Q1FL, + IRDMA_HMC_IW_PBLE, IRDMA_HMC_IW_TIMER, IRDMA_HMC_IW_FSIMC, IRDMA_HMC_IW_FSIAV, @@ -827,6 +828,8 @@ static int irdma_create_hmc_objs(struct irdma_pci_f *rf, bool privileged, info.entry_type = rf->sd_type; for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) { + if (iw_hmc_obj_types[i] == IRDMA_HMC_IW_PBLE) + continue; if (dev->hmc_info->hmc_obj[iw_hmc_obj_types[i]].cnt) { info.rsrc_type = iw_hmc_obj_types[i]; info.count = dev->hmc_info->hmc_obj[info.rsrc_type].cnt; diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 445e69e86409..7887230c867b 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2595,7 +2595,10 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) /* remove the SQ WR by moving SQ tail*/ IRDMA_RING_SET_TAIL(*sq_ring, sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta); - + if (cmpl->cpi.op_type == IRDMAQP_OP_NOP) { + kfree(cmpl); + continue; + } ibdev_dbg(iwqp->iwscq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5b988db66b8f..5d45de223c43 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -442,6 +442,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_HDR; + break; case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_NDR; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 80fe92a21f96..815ea72ad473 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -2245,10 +2245,10 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from) struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp); struct qib_user_sdma_queue *pq = fp->pq; - if (!iter_is_iovec(from) || !from->nr_segs || !pq) + if (!from->user_backed || !from->nr_segs || !pq) return -EINVAL; - return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs); + return qib_user_sdma_writev(rcd, pq, iter_iov(from), from->nr_segs); } static struct class *qib_class; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3acab569fbb9..9b4c0389d2c0 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -97,7 +97,7 @@ static void cacheless_memcpy(void *dst, void *src, size_t n) * there are no security issues. The extra fault recovery machinery * is not invoked. */ - __copy_user_nocache(dst, (void __user *)src, n, 0); + __copy_user_nocache(dst, (void __user *)src, n); } void rvt_wss_exit(struct rvt_dev_info *rdi) diff --git a/drivers/input/tablet/pegasus_notetaker.c b/drivers/input/tablet/pegasus_notetaker.c index d836d3dcc6a2..a68da2988f9c 100644 --- a/drivers/input/tablet/pegasus_notetaker.c +++ b/drivers/input/tablet/pegasus_notetaker.c @@ -296,6 +296,12 @@ static int pegasus_probe(struct usb_interface *intf, pegasus->intf = intf; pipe = usb_rcvintpipe(dev, endpoint->bEndpointAddress); + /* Sanity check that pipe's type matches endpoint's type */ + if (usb_pipe_type_check(dev, pipe)) { + error = -EINVAL; + goto err_free_mem; + } + pegasus->data_len = usb_maxpacket(dev, pipe); pegasus->data = usb_alloc_coherent(dev, pegasus->data_len, GFP_KERNEL, diff --git a/drivers/input/touchscreen/cyttsp5.c b/drivers/input/touchscreen/cyttsp5.c index 16caffa35dd9..30102cb80fac 100644 --- a/drivers/input/touchscreen/cyttsp5.c +++ b/drivers/input/touchscreen/cyttsp5.c @@ -111,6 +111,7 @@ struct cyttsp5_sensing_conf_data_dev { __le16 max_z; u8 origin_x; u8 origin_y; + u8 panel_id; u8 btn; u8 scan_mode; u8 max_num_of_tch_per_refresh_cycle; diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c index bf7667845459..bbfaf6536903 100644 --- a/drivers/memstick/core/memstick.c +++ b/drivers/memstick/core/memstick.c @@ -410,6 +410,7 @@ static struct memstick_dev *memstick_alloc_card(struct memstick_host *host) return card; err_out: host->card = old_card; + kfree_const(card->dev.kobj.name); kfree(card); return NULL; } @@ -468,8 +469,10 @@ static void memstick_check(struct work_struct *work) put_device(&card->dev); host->card = NULL; } - } else + } else { + kfree_const(card->dev.kobj.name); kfree(card); + } } out_power_off: diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c index 172696abce31..f22b44827e92 100644 --- a/drivers/misc/vmw_vmci/vmci_context.c +++ b/drivers/misc/vmw_vmci/vmci_context.c @@ -687,7 +687,7 @@ int vmci_ctx_remove_notification(u32 context_id, u32 remote_cid) spin_unlock(&context->lock); if (notifier) - kvfree_rcu(notifier); + kvfree_rcu_mightsleep(notifier); vmci_ctx_put(context); diff --git a/drivers/misc/vmw_vmci/vmci_event.c b/drivers/misc/vmw_vmci/vmci_event.c index 2100297c94ad..5d7ac07623c2 100644 --- a/drivers/misc/vmw_vmci/vmci_event.c +++ b/drivers/misc/vmw_vmci/vmci_event.c @@ -209,7 +209,7 @@ int vmci_event_unsubscribe(u32 sub_id) if (!s) return VMCI_ERROR_NOT_FOUND; - kvfree_rcu(s); + kvfree_rcu_mightsleep(s); return VMCI_SUCCESS; } diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 89953093e20c..672d37ea98d0 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -351,8 +351,6 @@ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) */ case MMC_TIMING_SD_HS: case MMC_TIMING_MMC_HS: - case MMC_TIMING_UHS_SDR12: - case MMC_TIMING_UHS_SDR25: val &= ~SDHCI_CTRL_HISPD; } } diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index 1e94e7d10b8b..a0a1194dc1d9 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -153,7 +153,7 @@ static int do_cached_write (struct mtdblk_dev *mtdblk, unsigned long pos, mtdblk->cache_state = STATE_EMPTY; ret = mtd_read(mtd, sect_start, sect_size, &retlen, mtdblk->cache_data); - if (ret) + if (ret && !mtd_is_bitflip(ret)) return ret; if (retlen != sect_size) return -EIO; @@ -188,8 +188,12 @@ static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos, pr_debug("mtdblock: read on \"%s\" at 0x%lx, size 0x%x\n", mtd->name, pos, len); - if (!sect_size) - return mtd_read(mtd, pos, len, &retlen, buf); + if (!sect_size) { + ret = mtd_read(mtd, pos, len, &retlen, buf); + if (ret && !mtd_is_bitflip(ret)) + return ret; + return 0; + } while (len > 0) { unsigned long sect_start = (pos/sect_size)*sect_size; @@ -209,7 +213,7 @@ static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos, memcpy (buf, mtdblk->cache_data + offset, size); } else { ret = mtd_read(mtd, pos, size, &retlen, buf); - if (ret) + if (ret && !mtd_is_bitflip(ret)) return ret; if (retlen != size) return -EIO; diff --git a/drivers/mtd/nand/raw/meson_nand.c b/drivers/mtd/nand/raw/meson_nand.c index a28574c00900..074e14225c06 100644 --- a/drivers/mtd/nand/raw/meson_nand.c +++ b/drivers/mtd/nand/raw/meson_nand.c @@ -280,7 +280,7 @@ static void meson_nfc_cmd_access(struct nand_chip *nand, int raw, bool dir, if (raw) { len = mtd->writesize + mtd->oobsize; - cmd = (len & GENMASK(5, 0)) | scrambler | DMA_DIR(dir); + cmd = (len & GENMASK(13, 0)) | scrambler | DMA_DIR(dir); writel(cmd, nfc->reg_base + NFC_REG_CMD); return; } @@ -544,7 +544,7 @@ static int meson_nfc_read_buf(struct nand_chip *nand, u8 *buf, int len) if (ret) goto out; - cmd = NFC_CMD_N2M | (len & GENMASK(5, 0)); + cmd = NFC_CMD_N2M | (len & GENMASK(13, 0)); writel(cmd, nfc->reg_base + NFC_REG_CMD); meson_nfc_drain_cmd(nfc); @@ -568,7 +568,7 @@ static int meson_nfc_write_buf(struct nand_chip *nand, u8 *buf, int len) if (ret) return ret; - cmd = NFC_CMD_M2N | (len & GENMASK(5, 0)); + cmd = NFC_CMD_M2N | (len & GENMASK(13, 0)); writel(cmd, nfc->reg_base + NFC_REG_CMD); meson_nfc_drain_cmd(nfc); diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index 5d627048c420..9e74bcd90aaa 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -1531,6 +1531,9 @@ static int stm32_fmc2_nfc_setup_interface(struct nand_chip *chip, int chipnr, if (IS_ERR(sdrt)) return PTR_ERR(sdrt); + if (conf->timings.mode > 3) + return -EOPNOTSUPP; + if (chipnr == NAND_DATA_IFACE_CHECK_ONLY) return 0; diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 0904eb40c95f..ad025b2ee417 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -666,12 +666,6 @@ static int io_init(struct ubi_device *ubi, int max_beb_per1024) ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); - if (ubi->vid_hdr_offset && ((ubi->vid_hdr_offset + UBI_VID_HDR_SIZE) > - ubi->vid_hdr_alsize)) { - ubi_err(ubi, "VID header offset %d too large.", ubi->vid_hdr_offset); - return -EINVAL; - } - dbg_gen("min_io_size %d", ubi->min_io_size); dbg_gen("max_write_size %d", ubi->max_write_size); dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size); @@ -689,6 +683,21 @@ static int io_init(struct ubi_device *ubi, int max_beb_per1024) ubi->vid_hdr_aloffset; } + /* + * Memory allocation for VID header is ubi->vid_hdr_alsize + * which is described in comments in io.c. + * Make sure VID header shift + UBI_VID_HDR_SIZE not exceeds + * ubi->vid_hdr_alsize, so that all vid header operations + * won't access memory out of bounds. + */ + if ((ubi->vid_hdr_shift + UBI_VID_HDR_SIZE) > ubi->vid_hdr_alsize) { + ubi_err(ubi, "Invalid VID header offset %d, VID header shift(%d)" + " + VID header size(%zu) > VID header aligned size(%d).", + ubi->vid_hdr_offset, ubi->vid_hdr_shift, + UBI_VID_HDR_SIZE, ubi->vid_hdr_alsize); + return -EINVAL; + } + /* Similar for the data offset */ ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE; ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 40f39e5d6dfc..26a214f016c1 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -575,7 +575,7 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured - * @nested: denotes whether the work_sem is already held in read mode + * @nested: denotes whether the work_sem is already held * * This function returns zero in case of success and a %-ENOMEM in case of * failure. @@ -1131,7 +1131,7 @@ static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk) int err1; /* Re-schedule the LEB for erasure */ - err1 = schedule_erase(ubi, e, vol_id, lnum, 0, false); + err1 = schedule_erase(ubi, e, vol_id, lnum, 0, true); if (err1) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 236e5219c811..7a7d584f378a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1777,14 +1777,15 @@ void bond_lower_state_changed(struct slave *slave) /* The bonding driver uses ether_setup() to convert a master bond device * to ARPHRD_ETHER, that resets the target netdevice's flags so we always - * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE if it was set + * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP + * if they were set */ static void bond_ether_setup(struct net_device *bond_dev) { - unsigned int slave_flag = bond_dev->flags & IFF_SLAVE; + unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); ether_setup(bond_dev); - bond_dev->flags |= IFF_MASTER | slave_flag; + bond_dev->flags |= IFF_MASTER | flags; bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } @@ -3269,7 +3270,8 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined); if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP || - combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT) + (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && + combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) goto out; saddr = &combined->ip6.saddr; @@ -3291,7 +3293,7 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) - bond_validate_na(bond, slave, saddr, daddr); + bond_validate_na(bond, slave, daddr, saddr); else if (curr_arp_slave && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_na(bond, slave, saddr, daddr); diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 3fffd5da8d3b..ffcad057d065 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -96,7 +96,7 @@ static int ksz8795_change_mtu(struct ksz_device *dev, int frame_size) if (frame_size > KSZ8_LEGAL_PACKET_SIZE) ctrl2 |= SW_LEGAL_PACKET_DISABLE; - else if (frame_size > KSZ8863_NORMAL_PACKET_SIZE) + if (frame_size > KSZ8863_NORMAL_PACKET_SIZE) ctrl1 |= SW_HUGE_PACKET; ret = ksz_rmw8(dev, REG_SW_CTRL_1, SW_HUGE_PACKET, ctrl1); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c23e3b397bcf..651b79ce5d80 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2388,7 +2388,7 @@ static int bnxt_async_event_process(struct bnxt *bp, case ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE: { switch (BNXT_EVENT_PHC_EVENT_TYPE(data1)) { case ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE: - if (bp->fw_cap & BNXT_FW_CAP_PTP_RTC) { + if (BNXT_PTP_USE_RTC(bp)) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; u64 ns; @@ -7627,7 +7627,7 @@ static int __bnxt_hwrm_ptp_qcfg(struct bnxt *bp) u8 flags; int rc; - if (bp->hwrm_spec_code < 0x10801) { + if (bp->hwrm_spec_code < 0x10801 || !BNXT_CHIP_P5_THOR(bp)) { rc = -ENODEV; goto no_ptp; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index e7b5e28ee29f..852eb449ccae 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -304,7 +304,7 @@ void bnxt_rdma_aux_device_uninit(struct bnxt *bp) struct auxiliary_device *adev; /* Skip if no auxiliary device init was done. */ - if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) + if (!bp->aux_priv) return; aux_priv = bp->aux_priv; @@ -324,6 +324,7 @@ static void bnxt_aux_dev_release(struct device *dev) bp->edev = NULL; kfree(aux_priv->edev); kfree(aux_priv); + bp->aux_priv = NULL; } static void bnxt_set_edev_info(struct bnxt_en_dev *edev, struct bnxt *bp) @@ -359,19 +360,18 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) return; - bp->aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL); - if (!bp->aux_priv) + aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL); + if (!aux_priv) goto exit; - bp->aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL); - if (bp->aux_priv->id < 0) { + aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL); + if (aux_priv->id < 0) { netdev_warn(bp->dev, "ida alloc failed for ROCE auxiliary device\n"); - kfree(bp->aux_priv); + kfree(aux_priv); goto exit; } - aux_priv = bp->aux_priv; aux_dev = &aux_priv->aux_dev; aux_dev->id = aux_priv->id; aux_dev->name = "rdma"; @@ -380,10 +380,11 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) rc = auxiliary_device_init(aux_dev); if (rc) { - ida_free(&bnxt_aux_dev_ids, bp->aux_priv->id); - kfree(bp->aux_priv); + ida_free(&bnxt_aux_dev_ids, aux_priv->id); + kfree(aux_priv); goto exit; } + bp->aux_priv = aux_priv; /* From this point, all cleanup will happen via the .release callback & * any error unwinding will need to include a call to diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 66e30561569e..e43d99ec50ba 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1064,6 +1064,10 @@ static dma_addr_t macb_get_addr(struct macb *bp, struct macb_dma_desc *desc) } #endif addr |= MACB_BF(RX_WADDR, MACB_BFEXT(RX_WADDR, desc->addr)); +#ifdef CONFIG_MACB_USE_HWSTAMP + if (bp->hw_dma_cap & HW_DMA_CAP_PTP) + addr &= ~GEM_BIT(DMA_RXVALID); +#endif return addr; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index dd9be229819a..d3541159487d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -1135,7 +1135,7 @@ void cxgb4_cleanup_tc_flower(struct adapter *adap) return; if (adap->flower_stats_timer.function) - del_timer_sync(&adap->flower_stats_timer); + timer_shutdown_sync(&adap->flower_stats_timer); cancel_work_sync(&adap->flower_stats_work); rhashtable_destroy(&adap->flower_tbl); adap->tc_flower_initialized = false; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index da9d4b310fcd..838750a03cf6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -989,6 +989,20 @@ static int enetc_get_mm(struct net_device *ndev, struct ethtool_mm_state *state) return 0; } +/* FIXME: Workaround for the link partner's verification failing if ENETC + * priorly received too much express traffic. The documentation doesn't + * suggest this is needed. + */ +static void enetc_restart_emac_rx(struct enetc_si *si) +{ + u32 val = enetc_port_rd(&si->hw, ENETC_PM0_CMD_CFG); + + enetc_port_wr(&si->hw, ENETC_PM0_CMD_CFG, val & ~ENETC_PM0_RX_EN); + + if (val & ENETC_PM0_RX_EN) + enetc_port_wr(&si->hw, ENETC_PM0_CMD_CFG, val); +} + static int enetc_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { @@ -1040,6 +1054,8 @@ static int enetc_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, enetc_port_wr(hw, ENETC_MMCSR, val); + enetc_restart_emac_rx(priv->si); + mutex_unlock(&priv->mm_lock); return 0; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index e1eb1de88bf9..e14d1e45318f 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5288,31 +5288,6 @@ static void e1000_watchdog_task(struct work_struct *work) ew32(TARC(0), tarc0); } - /* disable TSO for pcie and 10/100 speeds, to avoid - * some hardware issues - */ - if (!(adapter->flags & FLAG_TSO_FORCE)) { - switch (adapter->link_speed) { - case SPEED_10: - case SPEED_100: - e_info("10/100 speed: disabling TSO\n"); - netdev->features &= ~NETIF_F_TSO; - netdev->features &= ~NETIF_F_TSO6; - break; - case SPEED_1000: - netdev->features |= NETIF_F_TSO; - netdev->features |= NETIF_F_TSO6; - break; - default: - /* oops */ - break; - } - if (hw->mac.type == e1000_pch_spt) { - netdev->features &= ~NETIF_F_TSO; - netdev->features &= ~NETIF_F_TSO6; - } - } - /* enable transmits in the hardware, need to do this * after setting TARC(0) */ @@ -7526,6 +7501,32 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_RXCSUM | NETIF_F_HW_CSUM); + /* disable TSO for pcie and 10/100 speeds to avoid + * some hardware issues and for i219 to fix transfer + * speed being capped at 60% + */ + if (!(adapter->flags & FLAG_TSO_FORCE)) { + switch (adapter->link_speed) { + case SPEED_10: + case SPEED_100: + e_info("10/100 speed: disabling TSO\n"); + netdev->features &= ~NETIF_F_TSO; + netdev->features &= ~NETIF_F_TSO6; + break; + case SPEED_1000: + netdev->features |= NETIF_F_TSO; + netdev->features |= NETIF_F_TSO6; + break; + default: + /* oops */ + break; + } + if (hw->mac.type == e1000_pch_spt) { + netdev->features &= ~NETIF_F_TSO; + netdev->features &= ~NETIF_F_TSO6; + } + } + /* Set user-changeable features (subset of all device features) */ netdev->hw_features = netdev->features; netdev->hw_features |= NETIF_F_RXFCS; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 228cd502bb48..7c30abd0dfc2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11059,8 +11059,11 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) pf->hw.aq.asq_last_status)); } /* reinit the misc interrupt */ - if (pf->flags & I40E_FLAG_MSIX_ENABLED) + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { ret = i40e_setup_misc_vector(pf); + if (ret) + goto end_unlock; + } /* Add a filter to drop all Flow control frames from any VSI from being * transmitted. By doing so we stop a malicious VF from sending out @@ -14133,15 +14136,15 @@ static int i40e_add_vsi(struct i40e_vsi *vsi) vsi->id = ctxt.vsi_number; } - vsi->active_filters = 0; - clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); spin_lock_bh(&vsi->mac_filter_hash_lock); + vsi->active_filters = 0; /* If macvlan filters already exist, force them to get loaded */ hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { f->state = I40E_FILTER_NEW; f_count++; } spin_unlock_bh(&vsi->mac_filter_hash_lock); + clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); if (f_count) { vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 232bc61d9eee..746ff76f2fb1 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -59,8 +59,6 @@ enum iavf_vsi_state_t { struct iavf_vsi { struct iavf_adapter *back; struct net_device *netdev; - unsigned long active_cvlans[BITS_TO_LONGS(VLAN_N_VID)]; - unsigned long active_svlans[BITS_TO_LONGS(VLAN_N_VID)]; u16 seid; u16 id; DECLARE_BITMAP(state, __IAVF_VSI_STATE_SIZE__); @@ -158,15 +156,20 @@ struct iavf_vlan { u16 tpid; }; +enum iavf_vlan_state_t { + IAVF_VLAN_INVALID, + IAVF_VLAN_ADD, /* filter needs to be added */ + IAVF_VLAN_IS_NEW, /* filter is new, wait for PF answer */ + IAVF_VLAN_ACTIVE, /* filter is accepted by PF */ + IAVF_VLAN_DISABLE, /* filter needs to be deleted by PF, then marked INACTIVE */ + IAVF_VLAN_INACTIVE, /* filter is inactive, we are in IFF_DOWN */ + IAVF_VLAN_REMOVE, /* filter needs to be removed from list */ +}; + struct iavf_vlan_filter { struct list_head list; struct iavf_vlan vlan; - struct { - u8 is_new_vlan:1; /* filter is new, wait for PF answer */ - u8 remove:1; /* filter needs to be removed */ - u8 add:1; /* filter needs to be added */ - u8 padding:5; - }; + enum iavf_vlan_state_t state; }; #define IAVF_MAX_TRAFFIC_CLASS 4 @@ -258,6 +261,7 @@ struct iavf_adapter { wait_queue_head_t vc_waitqueue; struct iavf_q_vector *q_vectors; struct list_head vlan_filter_list; + int num_vlan_filters; struct list_head mac_filter_list; struct mutex crit_lock; struct mutex client_lock; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 095201e83c9d..2de4baff4c20 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -791,7 +791,8 @@ iavf_vlan_filter *iavf_add_vlan(struct iavf_adapter *adapter, f->vlan = vlan; list_add_tail(&f->list, &adapter->vlan_filter_list); - f->add = true; + f->state = IAVF_VLAN_ADD; + adapter->num_vlan_filters++; adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; } @@ -813,7 +814,7 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) f = iavf_find_vlan(adapter, vlan); if (f) { - f->remove = true; + f->state = IAVF_VLAN_REMOVE; adapter->aq_required |= IAVF_FLAG_AQ_DEL_VLAN_FILTER; } @@ -828,14 +829,18 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) **/ static void iavf_restore_filters(struct iavf_adapter *adapter) { - u16 vid; + struct iavf_vlan_filter *f; /* re-add all VLAN filters */ - for_each_set_bit(vid, adapter->vsi.active_cvlans, VLAN_N_VID) - iavf_add_vlan(adapter, IAVF_VLAN(vid, ETH_P_8021Q)); + spin_lock_bh(&adapter->mac_vlan_list_lock); - for_each_set_bit(vid, adapter->vsi.active_svlans, VLAN_N_VID) - iavf_add_vlan(adapter, IAVF_VLAN(vid, ETH_P_8021AD)); + list_for_each_entry(f, &adapter->vlan_filter_list, list) { + if (f->state == IAVF_VLAN_INACTIVE) + f->state = IAVF_VLAN_ADD; + } + + spin_unlock_bh(&adapter->mac_vlan_list_lock); + adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; } /** @@ -844,8 +849,7 @@ static void iavf_restore_filters(struct iavf_adapter *adapter) */ u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter) { - return bitmap_weight(adapter->vsi.active_cvlans, VLAN_N_VID) + - bitmap_weight(adapter->vsi.active_svlans, VLAN_N_VID); + return adapter->num_vlan_filters; } /** @@ -928,11 +932,6 @@ static int iavf_vlan_rx_kill_vid(struct net_device *netdev, return 0; iavf_del_vlan(adapter, IAVF_VLAN(vid, be16_to_cpu(proto))); - if (proto == cpu_to_be16(ETH_P_8021Q)) - clear_bit(vid, adapter->vsi.active_cvlans); - else - clear_bit(vid, adapter->vsi.active_svlans); - return 0; } @@ -1293,16 +1292,11 @@ static void iavf_clear_mac_vlan_filters(struct iavf_adapter *adapter) } } - /* remove all VLAN filters */ + /* disable all VLAN filters */ list_for_each_entry_safe(vlf, vlftmp, &adapter->vlan_filter_list, - list) { - if (vlf->add) { - list_del(&vlf->list); - kfree(vlf); - } else { - vlf->remove = true; - } - } + list) + vlf->state = IAVF_VLAN_DISABLE; + spin_unlock_bh(&adapter->mac_vlan_list_lock); } @@ -2914,6 +2908,7 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) list_del(&fv->list); kfree(fv); } + adapter->num_vlan_filters = 0; spin_unlock_bh(&adapter->mac_vlan_list_lock); @@ -3131,9 +3126,6 @@ continue_reset: adapter->aq_required |= IAVF_FLAG_AQ_ADD_CLOUD_FILTER; iavf_misc_irq_enable(adapter); - bitmap_clear(adapter->vsi.active_cvlans, 0, VLAN_N_VID); - bitmap_clear(adapter->vsi.active_svlans, 0, VLAN_N_VID); - mod_delayed_work(adapter->wq, &adapter->watchdog_task, 2); /* We were running when the reset started, so we need to restore some diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 4e17d006c52d..9afbbdac3590 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -642,16 +642,10 @@ static void iavf_vlan_add_reject(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->is_new_vlan) { - if (f->vlan.tpid == ETH_P_8021Q) - clear_bit(f->vlan.vid, - adapter->vsi.active_cvlans); - else - clear_bit(f->vlan.vid, - adapter->vsi.active_svlans); - + if (f->state == IAVF_VLAN_IS_NEW) { list_del(&f->list); kfree(f); + adapter->num_vlan_filters--; } } spin_unlock_bh(&adapter->mac_vlan_list_lock); @@ -679,7 +673,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->add) + if (f->state == IAVF_VLAN_ADD) count++; } if (!count || !VLAN_FILTERING_ALLOWED(adapter)) { @@ -710,11 +704,10 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->add) { + if (f->state == IAVF_VLAN_ADD) { vvfl->vlan_id[i] = f->vlan.vid; i++; - f->add = false; - f->is_new_vlan = true; + f->state = IAVF_VLAN_IS_NEW; if (i == count) break; } @@ -760,7 +753,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->add) { + if (f->state == IAVF_VLAN_ADD) { struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; @@ -778,8 +771,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vlan->tpid = f->vlan.tpid; i++; - f->add = false; - f->is_new_vlan = true; + f->state = IAVF_VLAN_IS_NEW; } } @@ -822,10 +814,16 @@ void iavf_del_vlans(struct iavf_adapter *adapter) * filters marked for removal to enable bailing out before * sending a virtchnl message */ - if (f->remove && !VLAN_FILTERING_ALLOWED(adapter)) { + if (f->state == IAVF_VLAN_REMOVE && + !VLAN_FILTERING_ALLOWED(adapter)) { list_del(&f->list); kfree(f); - } else if (f->remove) { + adapter->num_vlan_filters--; + } else if (f->state == IAVF_VLAN_DISABLE && + !VLAN_FILTERING_ALLOWED(adapter)) { + f->state = IAVF_VLAN_INACTIVE; + } else if (f->state == IAVF_VLAN_REMOVE || + f->state == IAVF_VLAN_DISABLE) { count++; } } @@ -857,11 +855,18 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->remove) { + if (f->state == IAVF_VLAN_DISABLE) { vvfl->vlan_id[i] = f->vlan.vid; + f->state = IAVF_VLAN_INACTIVE; i++; + if (i == count) + break; + } else if (f->state == IAVF_VLAN_REMOVE) { + vvfl->vlan_id[i] = f->vlan.vid; list_del(&f->list); kfree(f); + adapter->num_vlan_filters--; + i++; if (i == count) break; } @@ -901,7 +906,8 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->remove) { + if (f->state == IAVF_VLAN_DISABLE || + f->state == IAVF_VLAN_REMOVE) { struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; @@ -915,8 +921,13 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vlan->tci = f->vlan.vid; vlan->tpid = f->vlan.tpid; - list_del(&f->list); - kfree(f); + if (f->state == IAVF_VLAN_DISABLE) { + f->state = IAVF_VLAN_INACTIVE; + } else { + list_del(&f->list); + kfree(f); + adapter->num_vlan_filters--; + } i++; if (i == count) break; @@ -2192,7 +2203,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, list_for_each_entry(vlf, &adapter->vlan_filter_list, list) - vlf->add = true; + vlf->state = IAVF_VLAN_ADD; adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; @@ -2260,7 +2271,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, list_for_each_entry(vlf, &adapter->vlan_filter_list, list) - vlf->add = true; + vlf->state = IAVF_VLAN_ADD; aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; } @@ -2444,15 +2455,8 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->is_new_vlan) { - f->is_new_vlan = false; - if (f->vlan.tpid == ETH_P_8021Q) - set_bit(f->vlan.vid, - adapter->vsi.active_cvlans); - else - set_bit(f->vlan.vid, - adapter->vsi.active_svlans); - } + if (f->state == IAVF_VLAN_IS_NEW) + f->state = IAVF_VLAN_ACTIVE; } spin_unlock_bh(&adapter->mac_vlan_list_lock); } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 4b5e459b6d49..332472fe4990 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -681,14 +681,32 @@ int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) return 0; } -int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { struct mlx4_en_xdp_buff *_ctx = (void *)ctx; + struct mlx4_cqe *cqe = _ctx->cqe; + enum xdp_rss_hash_type xht = 0; + __be16 status; if (unlikely(!(_ctx->dev->features & NETIF_F_RXHASH))) return -ENODATA; - *hash = be32_to_cpu(_ctx->cqe->immed_rss_invalid); + *hash = be32_to_cpu(cqe->immed_rss_invalid); + status = cqe->status; + if (status & cpu_to_be16(MLX4_CQE_STATUS_TCP)) + xht = XDP_RSS_L4_TCP; + if (status & cpu_to_be16(MLX4_CQE_STATUS_UDP)) + xht = XDP_RSS_L4_UDP; + if (status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV4F)) + xht |= XDP_RSS_L3_IPV4; + if (status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) { + xht |= XDP_RSS_L3_IPV6; + if (cqe->ipv6_ext_mask) + xht |= XDP_RSS_L3_DYNHDR; + } + *rss_type = xht; + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 544e09b97483..4ac4d883047b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -798,7 +798,8 @@ int mlx4_en_netdev_event(struct notifier_block *this, struct xdp_md; int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp); -int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash); +int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type); /* * Functions for time stamping diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 445fe30c3d0b..2e7806001fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -59,9 +59,6 @@ bool mlx5_eth_supported(struct mlx5_core_dev *dev) if (!IS_ENABLED(CONFIG_MLX5_CORE_EN)) return false; - if (mlx5_core_is_management_pf(dev)) - return false; - if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) return false; @@ -201,9 +198,6 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev) if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) return false; - if (mlx5_core_is_management_pf(dev)) - return false; - if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV) return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 7c9c4e40c019..d000236ddbac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -75,10 +75,6 @@ int mlx5_ec_init(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return 0; - /* Management PF don't have a peer PF */ - if (mlx5_core_is_management_pf(dev)) - return 0; - return mlx5_host_pf_init(dev); } @@ -89,10 +85,6 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return; - /* Management PF don't have a peer PF */ - if (mlx5_core_is_management_pf(dev)) - return; - mlx5_host_pf_cleanup(dev); err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c index ca834bbcb44f..8afcec0c5d3c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c @@ -242,7 +242,7 @@ mlx5e_int_port_remove(struct mlx5e_tc_int_port_priv *priv, mlx5_del_flow_rules(int_port->rx_rule); mapping_remove(ctx, int_port->mapping); mlx5e_int_port_metadata_free(priv, int_port->match_metadata); - kfree_rcu(int_port); + kfree_rcu_mightsleep(int_port); priv->num_ports--; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index c5dae48b7932..d9d3b9e1f15a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -34,6 +34,7 @@ #include <net/xdp_sock_drv.h> #include "en/xdp.h" #include "en/params.h" +#include <linux/bitfield.h> int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) { @@ -169,14 +170,72 @@ static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) return 0; } -static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +/* Mapping HW RSS Type bits CQE_RSS_HTYPE_IP + CQE_RSS_HTYPE_L4 into 4-bits*/ +#define RSS_TYPE_MAX_TABLE 16 /* 4-bits max 16 entries */ +#define RSS_L4 GENMASK(1, 0) +#define RSS_L3 GENMASK(3, 2) /* Same as CQE_RSS_HTYPE_IP */ + +/* Valid combinations of CQE_RSS_HTYPE_IP + CQE_RSS_HTYPE_L4 sorted numerical */ +enum mlx5_rss_hash_type { + RSS_TYPE_NO_HASH = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IP_NONE) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)), + RSS_TYPE_L3_IPV4 = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)), + RSS_TYPE_L4_IPV4_TCP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_TCP)), + RSS_TYPE_L4_IPV4_UDP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_UDP)), + RSS_TYPE_L4_IPV4_IPSEC = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_IPSEC)), + RSS_TYPE_L3_IPV6 = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)), + RSS_TYPE_L4_IPV6_TCP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_TCP)), + RSS_TYPE_L4_IPV6_UDP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_UDP)), + RSS_TYPE_L4_IPV6_IPSEC = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_IPSEC)), +}; + +/* Invalid combinations will simply return zero, allows no boundary checks */ +static const enum xdp_rss_hash_type mlx5_xdp_rss_type[RSS_TYPE_MAX_TABLE] = { + [RSS_TYPE_NO_HASH] = XDP_RSS_TYPE_NONE, + [1] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [2] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [3] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [RSS_TYPE_L3_IPV4] = XDP_RSS_TYPE_L3_IPV4, + [RSS_TYPE_L4_IPV4_TCP] = XDP_RSS_TYPE_L4_IPV4_TCP, + [RSS_TYPE_L4_IPV4_UDP] = XDP_RSS_TYPE_L4_IPV4_UDP, + [RSS_TYPE_L4_IPV4_IPSEC] = XDP_RSS_TYPE_L4_IPV4_IPSEC, + [RSS_TYPE_L3_IPV6] = XDP_RSS_TYPE_L3_IPV6, + [RSS_TYPE_L4_IPV6_TCP] = XDP_RSS_TYPE_L4_IPV6_TCP, + [RSS_TYPE_L4_IPV6_UDP] = XDP_RSS_TYPE_L4_IPV6_UDP, + [RSS_TYPE_L4_IPV6_IPSEC] = XDP_RSS_TYPE_L4_IPV6_IPSEC, + [12] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [13] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [14] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [15] = XDP_RSS_TYPE_NONE, /* Implicit zero */ +}; + +static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { const struct mlx5e_xdp_buff *_ctx = (void *)ctx; + const struct mlx5_cqe64 *cqe = _ctx->cqe; + u32 hash_type, l4_type, ip_type, lookup; if (unlikely(!(_ctx->xdp.rxq->dev->features & NETIF_F_RXHASH))) return -ENODATA; - *hash = be32_to_cpu(_ctx->cqe->rss_hash_result); + *hash = be32_to_cpu(cqe->rss_hash_result); + + hash_type = cqe->rss_hash_type; + BUILD_BUG_ON(CQE_RSS_HTYPE_IP != RSS_L3); /* same mask */ + ip_type = hash_type & CQE_RSS_HTYPE_IP; + l4_type = FIELD_GET(CQE_RSS_HTYPE_L4, hash_type); + lookup = ip_type | l4_type; + *rss_type = mlx5_xdp_rss_type[lookup]; + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 33b3620ea45c..51f1cd8364c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -670,7 +670,7 @@ static int mlx5e_macsec_del_txsa(struct macsec_context *ctx) mlx5e_macsec_cleanup_sa(macsec, tx_sa, true); mlx5_destroy_encryption_key(macsec->mdev, tx_sa->enc_key_id); - kfree_rcu(tx_sa); + kfree_rcu_mightsleep(tx_sa); macsec_device->tx_sa[assoc_num] = NULL; out: @@ -849,7 +849,7 @@ static void macsec_del_rxsc_ctx(struct mlx5e_macsec *macsec, struct mlx5e_macsec xa_erase(&macsec->sc_xarray, rx_sc->sc_xarray_element->fs_id); metadata_dst_free(rx_sc->md_dst); kfree(rx_sc->sc_xarray_element); - kfree_rcu(rx_sc); + kfree_rcu_mightsleep(rx_sc); } static int mlx5e_macsec_del_rxsc(struct macsec_context *ctx) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 8bdf28762f41..19fed514fc17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1488,7 +1488,7 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 * void *hca_caps; int err; - if (!mlx5_core_is_ecpf(dev) || mlx5_core_is_management_pf(dev)) { + if (!mlx5_core_is_ecpf(dev)) { *max_sfs = 0; return 0; } diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c index 017d68f1e123..972c571b4158 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c @@ -31,6 +31,8 @@ mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file, if (tlv->type == MLXFW_MFA2_TLV_MULTI_PART) { multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv); + if (!multi) + return NULL; tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len)); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h index 48dbfea0a2a1..7cdf0ce24f28 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -26,7 +26,7 @@ #define MLXSW_PCI_CIR_TIMEOUT_MSECS 1000 #define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 900000 -#define MLXSW_PCI_SW_RESET_WAIT_MSECS 200 +#define MLXSW_PCI_SW_RESET_WAIT_MSECS 400 #define MLXSW_PCI_FW_READY 0xA1844 #define MLXSW_PCI_FW_READY_MASK 0xFFFF #define MLXSW_PCI_FW_READY_MAGIC 0x5E diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index 87f76bac2e46..eb827b86ecae 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -628,7 +628,13 @@ int qlcnic_fw_create_ctx(struct qlcnic_adapter *dev) int i, err, ring; if (dev->flags & QLCNIC_NEED_FLR) { - pci_reset_function(dev->pdev); + err = pci_reset_function(dev->pdev); + if (err) { + dev_err(&dev->pdev->dev, + "Adapter reset failed (%d). Please reboot\n", + err); + return err; + } dev->flags &= ~QLCNIC_NEED_FLR; } diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 884d8d168862..1eceffa02b55 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -541,7 +541,6 @@ int efx_net_open(struct net_device *net_dev) else efx->state = STATE_NET_UP; - efx_selftest_async_start(efx); return 0; } diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c index cc30524c2fe4..361687de308d 100644 --- a/drivers/net/ethernet/sfc/efx_common.c +++ b/drivers/net/ethernet/sfc/efx_common.c @@ -544,6 +544,8 @@ void efx_start_all(struct efx_nic *efx) /* Start the hardware monitor if there is one */ efx_start_monitor(efx); + efx_selftest_async_start(efx); + /* Link state detection is normally event-driven; we have * to poll now because we could have missed a change */ diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index ab8b09a9ef61..7a2e76776297 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -4522,7 +4522,7 @@ static int niu_alloc_channels(struct niu *np) err = niu_rbr_fill(np, rp, GFP_KERNEL); if (err) - return err; + goto out_err; } tx_rings = kcalloc(num_tx_rings, sizeof(struct tx_ring_info), diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 37f0b62ec5d6..f9cd566d1c9b 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -27,7 +27,7 @@ #include <linux/of.h> #include <linux/of_mdio.h> #include <linux/of_net.h> -#include <linux/of_device.h> +#include <linux/of_platform.h> #include <linux/if_vlan.h> #include <linux/kmemleak.h> #include <linux/sys_soc.h> diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 35128dd45ffc..c61e4e44a78f 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -7,6 +7,7 @@ #include <linux/io.h> #include <linux/clk.h> +#include <linux/platform_device.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/irqreturn.h> @@ -23,7 +24,7 @@ #include <linux/of.h> #include <linux/of_mdio.h> #include <linux/of_net.h> -#include <linux/of_device.h> +#include <linux/of_platform.h> #include <linux/if_vlan.h> #include <linux/kmemleak.h> #include <linux/sys_soc.h> diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index a9c44f08199d..a94c7bd5db2e 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -47,7 +47,7 @@ config BPQETHER config SCC tristate "Z8530 SCC driver" - depends on ISA && AX25 && ISA_DMA_API + depends on ISA && AX25 help These cards are used to connect your Linux box to an amateur radio in order to communicate with other computers. If you want to use diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 5813b07242ce..029875a59ff8 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -191,7 +191,7 @@ #define MAX_ID_PS 2260U #define DEFAULT_ID_PS 2000U -#define PPM_TO_SUBNS_INC(ppb) div_u64(GENMASK(31, 0) * (ppb) * \ +#define PPM_TO_SUBNS_INC(ppb) div_u64(GENMASK_ULL(31, 0) * (ppb) * \ PTP_CLK_PERIOD_100BT1, NSEC_PER_SEC) #define NXP_C45_SKB_CB(skb) ((struct nxp_c45_skb_cb *)(skb)->cb) @@ -1337,6 +1337,17 @@ no_ptp_support: return ret; } +static void nxp_c45_remove(struct phy_device *phydev) +{ + struct nxp_c45_phy *priv = phydev->priv; + + if (priv->ptp_clock) + ptp_clock_unregister(priv->ptp_clock); + + skb_queue_purge(&priv->tx_queue); + skb_queue_purge(&priv->rx_queue); +} + static struct phy_driver nxp_c45_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_TJA_1103), @@ -1359,6 +1370,7 @@ static struct phy_driver nxp_c45_driver[] = { .set_loopback = genphy_c45_loopback, .get_sqi = nxp_c45_get_sqi, .get_sqi_max = nxp_c45_get_sqi_max, + .remove = nxp_c45_remove, }, }; diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 8af10bb53e57..bf345032d450 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -210,6 +210,12 @@ static const enum gpiod_flags gpio_flags[] = { #define SFP_PHY_ADDR 22 #define SFP_PHY_ADDR_ROLLBALL 17 +/* SFP_EEPROM_BLOCK_SIZE is the size of data chunk to read the EEPROM + * at a time. Some SFP modules and also some Linux I2C drivers do not like + * reads longer than 16 bytes. + */ +#define SFP_EEPROM_BLOCK_SIZE 16 + struct sff_data { unsigned int gpios; bool (*module_supported)(const struct sfp_eeprom_id *id); @@ -1929,11 +1935,7 @@ static int sfp_sm_mod_probe(struct sfp *sfp, bool report) u8 check; int ret; - /* Some SFP modules and also some Linux I2C drivers do not like reads - * longer than 16 bytes, so read the EEPROM in chunks of 16 bytes at - * a time. - */ - sfp->i2c_block_size = 16; + sfp->i2c_block_size = SFP_EEPROM_BLOCK_SIZE; ret = sfp_read(sfp, false, 0, &id.base, sizeof(id.base)); if (ret < 0) { @@ -2485,6 +2487,9 @@ static int sfp_module_eeprom(struct sfp *sfp, struct ethtool_eeprom *ee, unsigned int first, last, len; int ret; + if (!(sfp->state & SFP_F_PRESENT)) + return -ENODEV; + if (ee->len == 0) return -EINVAL; @@ -2517,6 +2522,9 @@ static int sfp_module_eeprom_by_page(struct sfp *sfp, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack) { + if (!(sfp->state & SFP_F_PRESENT)) + return -ENODEV; + if (page->bank) { NL_SET_ERR_MSG(extack, "Banks not supported"); return -EOPNOTSUPP; @@ -2621,6 +2629,7 @@ static struct sfp *sfp_alloc(struct device *dev) return ERR_PTR(-ENOMEM); sfp->dev = dev; + sfp->i2c_block_size = SFP_EEPROM_BLOCK_SIZE; mutex_init(&sfp->sm_mutex); mutex_init(&sfp->st_mutex); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ad653b32b2f0..5df1eba7b30a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1486,7 +1486,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { - size_t fragsz = it->iov[i].iov_len; + const struct iovec *iov = iter_iov(it); + size_t fragsz = iov->iov_len; struct page *page; void *frag; diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index decb5ba56a25..0fc4b959edc1 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1943,7 +1943,7 @@ static struct rx_agg *alloc_rx_agg(struct r8152 *tp, gfp_t mflags) if (!rx_agg) return NULL; - rx_agg->page = alloc_pages(mflags | __GFP_COMP, order); + rx_agg->page = alloc_pages(mflags | __GFP_COMP | __GFP_NOWARN, order); if (!rx_agg->page) goto free_rx; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index c1178915496d..4b3c6647edc6 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1262,11 +1262,12 @@ static void veth_set_xdp_features(struct net_device *dev) peer = rtnl_dereference(priv->peer); if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { + struct veth_priv *priv_peer = netdev_priv(peer); xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; - if (priv->_xdp_prog || veth_gro_requested(dev)) + if (priv_peer->_xdp_prog || veth_gro_requested(peer)) val |= NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag(dev, val); @@ -1504,19 +1505,23 @@ static int veth_set_features(struct net_device *dev, { netdev_features_t changed = features ^ dev->features; struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; int err; if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) return 0; + peer = rtnl_dereference(priv->peer); if (features & NETIF_F_GRO) { err = veth_napi_enable(dev); if (err) return err; - xdp_features_set_redirect_target(dev, true); + if (peer) + xdp_features_set_redirect_target(peer, true); } else { - xdp_features_clear_redirect_target(dev); + if (peer) + xdp_features_clear_redirect_target(peer); veth_napi_del(dev); } return 0; @@ -1598,13 +1603,13 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, peer->max_mtu = max_mtu; } - xdp_features_set_redirect_target(dev, true); + xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { - if (!veth_gro_requested(dev)) - xdp_features_clear_redirect_target(dev); + if (peer && !veth_gro_requested(dev)) + xdp_features_clear_redirect_target(peer); if (dev->flags & IFF_UP) veth_disable_xdp(dev); @@ -1648,14 +1653,18 @@ static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) return 0; } -static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { struct veth_xdp_buff *_ctx = (void *)ctx; + struct sk_buff *skb = _ctx->skb; - if (!_ctx->skb) + if (!skb) return -ENODATA; - *hash = skb_get_hash(_ctx->skb); + *hash = skb_get_hash(skb); + *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; + return 0; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2396c28c0122..ea1bd4bb326d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -814,8 +814,13 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, int page_off, unsigned int *len) { - struct page *page = alloc_page(GFP_ATOMIC); + int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct page *page; + + if (page_off + *len + tailroom > PAGE_SIZE) + return NULL; + page = alloc_page(GFP_ATOMIC); if (!page) return NULL; @@ -823,7 +828,6 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, page_off += *len; while (--*num_buf) { - int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); unsigned int buflen; void *buf; int off; diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index da488cbb0542..f2b76ee866a4 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1504,7 +1504,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, goto rcd_done; } - if (rxDataRingUsed) { + if (rxDataRingUsed && adapter->rxdataring_enabled) { size_t sz; BUG_ON(rcd->len > rq->data_ring.desc_size); diff --git a/drivers/net/wireless/ath/ath9k/mci.c b/drivers/net/wireless/ath/ath9k/mci.c index 3363fc4e8966..a0845002d6fe 100644 --- a/drivers/net/wireless/ath/ath9k/mci.c +++ b/drivers/net/wireless/ath/ath9k/mci.c @@ -646,9 +646,7 @@ void ath9k_mci_update_wlan_channels(struct ath_softc *sc, bool allow_all) struct ath_hw *ah = sc->sc_ah; struct ath9k_hw_mci *mci = &ah->btcoex_hw.mci; struct ath9k_channel *chan = ah->curchan; - static const u32 channelmap[] = { - 0x00000000, 0xffff0000, 0xffffffff, 0x7fffffff - }; + u32 channelmap[] = {0x00000000, 0xffff0000, 0xffffffff, 0x7fffffff}; int i; s16 chan_start, chan_end; u16 wlan_chan; diff --git a/drivers/net/wwan/iosm/iosm_ipc_pcie.c b/drivers/net/wwan/iosm/iosm_ipc_pcie.c index 5bf5a93937c9..04517bd3325a 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_pcie.c +++ b/drivers/net/wwan/iosm/iosm_ipc_pcie.c @@ -295,7 +295,7 @@ static int ipc_pcie_probe(struct pci_dev *pci, ret = dma_set_mask(ipc_pcie->dev, DMA_BIT_MASK(64)); if (ret) { dev_err(ipc_pcie->dev, "Could not set PCI DMA mask: %d", ret); - return ret; + goto set_mask_fail; } ipc_pcie_config_aspm(ipc_pcie); @@ -323,6 +323,7 @@ static int ipc_pcie_probe(struct pci_dev *pci, imem_init_fail: ipc_pcie_resources_release(ipc_pcie); resources_req_fail: +set_mask_fail: pci_disable_device(pci); pci_enable_fail: kfree(ipc_pcie); diff --git a/drivers/nubus/bus.c b/drivers/nubus/bus.c index 17fad660032c..72921e4f35f6 100644 --- a/drivers/nubus/bus.c +++ b/drivers/nubus/bus.c @@ -14,11 +14,6 @@ #define to_nubus_board(d) container_of(d, struct nubus_board, dev) #define to_nubus_driver(d) container_of(d, struct nubus_driver, driver) -static int nubus_bus_match(struct device *dev, struct device_driver *driver) -{ - return 1; -} - static int nubus_device_probe(struct device *dev) { struct nubus_driver *ndrv = to_nubus_driver(dev->driver); @@ -39,7 +34,6 @@ static void nubus_device_remove(struct device *dev) struct bus_type nubus_bus_type = { .name = "nubus", - .match = nubus_bus_match, .probe = nubus_device_probe, .remove = nubus_device_remove, }; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 282d808400c5..cd7873de3121 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3443,6 +3443,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */ .driver_data = NVME_QUIRK_BOGUS_NID | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x10ec, 0x5763), /* TEAMGROUP T-FORCE CARDEA ZERO Z330 SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 07d93753b12f..e311d406b170 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -226,6 +226,7 @@ static void __of_attach_node(struct device_node *np) np->sibling = np->parent->child; np->parent->child = np; of_node_clear_flag(np, OF_DETACHED); + np->fwnode.flags |= FWNODE_FLAG_NOT_DEVICE; } /** diff --git a/drivers/of/platform.c b/drivers/of/platform.c index b2bd2e783445..78ae84187449 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -737,6 +737,11 @@ static int of_platform_notify(struct notifier_block *nb, if (of_node_check_flag(rd->dn, OF_POPULATED)) return NOTIFY_OK; + /* + * Clear the flag before adding the device so that fw_devlink + * doesn't skip adding consumers to this device. + */ + rd->dn->fwnode.flags &= ~FWNODE_FLAG_NOT_DEVICE; /* pdev_parent may be NULL when no bus platform device */ pdev_parent = of_find_device_by_node(rd->dn->parent); pdev = of_platform_device_create(rd->dn, NULL, diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 1f716624ca56..ef1d8857a51b 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -750,8 +750,7 @@ out_disable: return ret; } -static bool pci_msix_validate_entries(struct pci_dev *dev, struct msix_entry *entries, - int nvec, int hwsize) +static bool pci_msix_validate_entries(struct pci_dev *dev, struct msix_entry *entries, int nvec) { bool nogap; int i, j; @@ -762,10 +761,6 @@ static bool pci_msix_validate_entries(struct pci_dev *dev, struct msix_entry *en nogap = pci_msi_domain_supports(dev, MSI_FLAG_MSIX_CONTIGUOUS, DENY_LEGACY); for (i = 0; i < nvec; i++) { - /* Entry within hardware limit? */ - if (entries[i].entry >= hwsize) - return false; - /* Check for duplicate entries */ for (j = i + 1; j < nvec; j++) { if (entries[i].entry == entries[j].entry) @@ -805,7 +800,7 @@ int __pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int if (hwsize < 0) return hwsize; - if (!pci_msix_validate_entries(dev, entries, nvec, hwsize)) + if (!pci_msix_validate_entries(dev, entries, nvec)) return -EINVAL; if (hwsize < nvec) { diff --git a/drivers/pci/of.c b/drivers/pci/of.c index 196834ed44fe..4c2ef2e28fb5 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -16,14 +16,32 @@ #include "pci.h" #ifdef CONFIG_PCI -void pci_set_of_node(struct pci_dev *dev) +/** + * pci_set_of_node - Find and set device's DT device_node + * @dev: the PCI device structure to fill + * + * Returns 0 on success with of_node set or when no device is described in the + * DT. Returns -ENODEV if the device is present, but disabled in the DT. + */ +int pci_set_of_node(struct pci_dev *dev) { + struct device_node *node; + if (!dev->bus->dev.of_node) - return; - dev->dev.of_node = of_pci_find_child_device(dev->bus->dev.of_node, - dev->devfn); - if (dev->dev.of_node) - dev->dev.fwnode = &dev->dev.of_node->fwnode; + return 0; + + node = of_pci_find_child_device(dev->bus->dev.of_node, dev->devfn); + if (!node) + return 0; + + if (!of_device_is_available(node)) { + of_node_put(node); + return -ENODEV; + } + + dev->dev.of_node = node; + dev->dev.fwnode = &node->fwnode; + return 0; } void pci_release_of_node(struct pci_dev *dev) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d2c08670a20e..2b48a0aa8008 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -624,7 +624,7 @@ int of_pci_get_max_link_speed(struct device_node *node); u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale); -void pci_set_of_node(struct pci_dev *dev); +int pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); @@ -662,7 +662,7 @@ of_pci_get_slot_power_limit(struct device_node *node, return 0; } -static inline void pci_set_of_node(struct pci_dev *dev) { } +static inline int pci_set_of_node(struct pci_dev *dev) { return 0; } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } static inline void pci_release_bus_of_node(struct pci_bus *bus) { } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index a3f68b6ba6ac..f96fa83f2627 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1826,7 +1826,7 @@ int pci_setup_device(struct pci_dev *dev) u32 class; u16 cmd; u8 hdr_type; - int pos = 0; + int err, pos = 0; struct pci_bus_region region; struct resource *res; @@ -1840,10 +1840,10 @@ int pci_setup_device(struct pci_dev *dev) dev->error_state = pci_channel_io_normal; set_pcie_port_type(dev); - pci_set_of_node(dev); + err = pci_set_of_node(dev); + if (err) + return err; pci_set_acpi_fwnode(dev); - if (dev->dev.fwnode && !fwnode_device_is_available(dev->dev.fwnode)) - return -ENODEV; pci_dev_assign_slot(dev); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 0145aef1b930..22d39e12b236 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -157,8 +157,6 @@ void pci_remove_root_bus(struct pci_bus *bus) list_for_each_entry_safe(child, tmp, &bus->devices, bus_list) pci_remove_bus_device(child); - pci_remove_bus(bus); - host_bridge->bus = NULL; #ifdef CONFIG_PCI_DOMAINS_GENERIC /* Release domain_nr if it was dynamically allocated */ @@ -166,6 +164,9 @@ void pci_remove_root_bus(struct pci_bus *bus) pci_bus_release_domain_nr(bus, host_bridge->dev.parent); #endif + pci_remove_bus(bus); + host_bridge->bus = NULL; + /* remove the host bridge */ device_del(&host_bridge->dev); } diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c index a78fdb15e26c..8b643888d503 100644 --- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c +++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c @@ -21,23 +21,23 @@ #define DMC_QOS_IRQ BIT(30) /* DMC bandwidth monitor register address offset */ -#define DMC_MON_G12_CTRL0 (0x20 << 2) -#define DMC_MON_G12_CTRL1 (0x21 << 2) -#define DMC_MON_G12_CTRL2 (0x22 << 2) -#define DMC_MON_G12_CTRL3 (0x23 << 2) -#define DMC_MON_G12_CTRL4 (0x24 << 2) -#define DMC_MON_G12_CTRL5 (0x25 << 2) -#define DMC_MON_G12_CTRL6 (0x26 << 2) -#define DMC_MON_G12_CTRL7 (0x27 << 2) -#define DMC_MON_G12_CTRL8 (0x28 << 2) - -#define DMC_MON_G12_ALL_REQ_CNT (0x29 << 2) -#define DMC_MON_G12_ALL_GRANT_CNT (0x2a << 2) -#define DMC_MON_G12_ONE_GRANT_CNT (0x2b << 2) -#define DMC_MON_G12_SEC_GRANT_CNT (0x2c << 2) -#define DMC_MON_G12_THD_GRANT_CNT (0x2d << 2) -#define DMC_MON_G12_FOR_GRANT_CNT (0x2e << 2) -#define DMC_MON_G12_TIMER (0x2f << 2) +#define DMC_MON_G12_CTRL0 (0x0 << 2) +#define DMC_MON_G12_CTRL1 (0x1 << 2) +#define DMC_MON_G12_CTRL2 (0x2 << 2) +#define DMC_MON_G12_CTRL3 (0x3 << 2) +#define DMC_MON_G12_CTRL4 (0x4 << 2) +#define DMC_MON_G12_CTRL5 (0x5 << 2) +#define DMC_MON_G12_CTRL6 (0x6 << 2) +#define DMC_MON_G12_CTRL7 (0x7 << 2) +#define DMC_MON_G12_CTRL8 (0x8 << 2) + +#define DMC_MON_G12_ALL_REQ_CNT (0x9 << 2) +#define DMC_MON_G12_ALL_GRANT_CNT (0xa << 2) +#define DMC_MON_G12_ONE_GRANT_CNT (0xb << 2) +#define DMC_MON_G12_SEC_GRANT_CNT (0xc << 2) +#define DMC_MON_G12_THD_GRANT_CNT (0xd << 2) +#define DMC_MON_G12_FOR_GRANT_CNT (0xe << 2) +#define DMC_MON_G12_TIMER (0xf << 2) /* Each bit represent a axi line */ PMU_FORMAT_ATTR(event, "config:0-7"); diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 609821b756c2..9236a132c7ba 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -872,34 +872,32 @@ static const struct pinconf_ops amd_pinconf_ops = { .pin_config_group_set = amd_pinconf_group_set, }; -static void amd_gpio_irq_init_pin(struct amd_gpio *gpio_dev, int pin) +static void amd_gpio_irq_init(struct amd_gpio *gpio_dev) { - const struct pin_desc *pd; + struct pinctrl_desc *desc = gpio_dev->pctrl->desc; unsigned long flags; u32 pin_reg, mask; + int i; mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3) | BIT(INTERRUPT_MASK_OFF) | BIT(INTERRUPT_ENABLE_OFF) | BIT(WAKE_CNTRL_OFF_S4); - pd = pin_desc_get(gpio_dev->pctrl, pin); - if (!pd) - return; + for (i = 0; i < desc->npins; i++) { + int pin = desc->pins[i].number; + const struct pin_desc *pd = pin_desc_get(gpio_dev->pctrl, pin); - raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + pin * 4); - pin_reg &= ~mask; - writel(pin_reg, gpio_dev->base + pin * 4); - raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); -} + if (!pd) + continue; -static void amd_gpio_irq_init(struct amd_gpio *gpio_dev) -{ - struct pinctrl_desc *desc = gpio_dev->pctrl->desc; - int i; + raw_spin_lock_irqsave(&gpio_dev->lock, flags); - for (i = 0; i < desc->npins; i++) - amd_gpio_irq_init_pin(gpio_dev, i); + pin_reg = readl(gpio_dev->base + i * 4); + pin_reg &= ~mask; + writel(pin_reg, gpio_dev->base + i * 4); + + raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); + } } #ifdef CONFIG_PM_SLEEP @@ -952,10 +950,8 @@ static int amd_gpio_resume(struct device *dev) for (i = 0; i < desc->npins; i++) { int pin = desc->pins[i].number; - if (!amd_gpio_should_save(gpio_dev, pin)) { - amd_gpio_irq_init_pin(gpio_dev, pin); + if (!amd_gpio_should_save(gpio_dev, pin)) continue; - } raw_spin_lock_irqsave(&gpio_dev->lock, flags); gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING; diff --git a/drivers/regulator/fan53555.c b/drivers/regulator/fan53555.c index 529963a7e4f5..41537c45f036 100644 --- a/drivers/regulator/fan53555.c +++ b/drivers/regulator/fan53555.c @@ -8,18 +8,19 @@ // Copyright (c) 2012 Marvell Technology Ltd. // Yunfan Zhang <yfzhang@marvell.com> +#include <linux/bits.h> +#include <linux/err.h> +#include <linux/i2c.h> #include <linux/module.h> +#include <linux/of_device.h> #include <linux/param.h> -#include <linux/err.h> #include <linux/platform_device.h> +#include <linux/regmap.h> #include <linux/regulator/driver.h> +#include <linux/regulator/fan53555.h> #include <linux/regulator/machine.h> #include <linux/regulator/of_regulator.h> -#include <linux/of_device.h> -#include <linux/i2c.h> #include <linux/slab.h> -#include <linux/regmap.h> -#include <linux/regulator/fan53555.h> /* Voltage setting */ #define FAN53555_VSEL0 0x00 @@ -60,7 +61,7 @@ #define TCS_VSEL1_MODE (1 << 6) #define TCS_SLEW_SHIFT 3 -#define TCS_SLEW_MASK (0x3 < 3) +#define TCS_SLEW_MASK GENMASK(4, 3) enum fan53555_vendor { FAN53526_VENDOR_FAIRCHILD = 0, diff --git a/drivers/regulator/sm5703-regulator.c b/drivers/regulator/sm5703-regulator.c index 05ad28fc4da8..229df7170792 100644 --- a/drivers/regulator/sm5703-regulator.c +++ b/drivers/regulator/sm5703-regulator.c @@ -42,6 +42,7 @@ static const int sm5703_buck_voltagemap[] = { .type = REGULATOR_VOLTAGE, \ .id = SM5703_USBLDO ## _id, \ .ops = &sm5703_regulator_ops_fixed, \ + .n_voltages = 1, \ .fixed_uV = SM5703_USBLDO_MICROVOLT, \ .enable_reg = SM5703_REG_USBLDO12, \ .enable_mask = SM5703_REG_EN_USBLDO ##_id, \ @@ -56,6 +57,7 @@ static const int sm5703_buck_voltagemap[] = { .type = REGULATOR_VOLTAGE, \ .id = SM5703_VBUS, \ .ops = &sm5703_regulator_ops_fixed, \ + .n_voltages = 1, \ .fixed_uV = SM5703_VBUS_MICROVOLT, \ .enable_reg = SM5703_REG_CNTL, \ .enable_mask = SM5703_OPERATION_MODE_MASK, \ diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index e300cf26bc2a..d698ca506cca 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -18,7 +18,7 @@ * the recommended way for applications to use the coprocessor, and * the driver interface is not intended for general use. * - * See Documentation/sparc/oradax/oracle-dax.rst for more details. + * See Documentation/arch/sparc/oradax/oracle-dax.rst for more details. */ #include <linux/uaccess.h> diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index b11a9162e73a..b54f2c6c08c3 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -509,9 +509,6 @@ static int ses_enclosure_find_by_addr(struct enclosure_device *edev, int i; struct ses_component *scomp; - if (!edev->component[0].scratch) - return 0; - for (i = 0; i < edev->components; i++) { scomp = edev->component[i].scratch; if (scomp->addr != efd->addr) @@ -602,8 +599,10 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, components++, type_ptr[0], name); - else + else if (components < edev->components) ecomp = &edev->component[components++]; + else + ecomp = ERR_PTR(-EINVAL); if (!IS_ERR(ecomp)) { if (addl_desc_ptr) { @@ -734,11 +733,6 @@ static int ses_intf_add(struct device *cdev, components += type_ptr[1]; } - if (components == 0) { - sdev_printk(KERN_WARNING, sdev, "enclosure has no enumerated components\n"); - goto err_free; - } - ses_dev->page1 = buf; ses_dev->page1_len = len; buf = NULL; @@ -780,9 +774,11 @@ static int ses_intf_add(struct device *cdev, buf = NULL; } page2_not_supported: - scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL); - if (!scomp) - goto err_free; + if (components > 0) { + scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL); + if (!scomp) + goto err_free; + } edev = enclosure_register(cdev->parent, dev_name(&sdev->sdev_gendev), components, &ses_enclosure_callbacks); diff --git a/drivers/spi/spi-rockchip-sfc.c b/drivers/spi/spi-rockchip-sfc.c index bd87d3c92dd3..69347b6bf60c 100644 --- a/drivers/spi/spi-rockchip-sfc.c +++ b/drivers/spi/spi-rockchip-sfc.c @@ -632,7 +632,7 @@ static int rockchip_sfc_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "Failed to request irq\n"); - return ret; + goto err_irq; } ret = rockchip_sfc_init(sfc); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 44b85a8d47f1..7bc14fb309a6 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -4456,6 +4456,11 @@ static int of_spi_notify(struct notifier_block *nb, unsigned long action, return NOTIFY_OK; } + /* + * Clear the flag before adding the device so that fw_devlink + * doesn't skip adding consumers to this device. + */ + rd->dn->fwnode.flags &= ~FWNODE_FLAG_NOT_DEVICE; spi = of_register_spi_device(ctlr, rd->dn); put_device(&ctlr->dev); diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index 290b1bb0e9cd..df5fb5410b72 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -488,7 +488,7 @@ static bool is_normal_memory(pgprot_t p) #elif defined(CONFIG_ARM64) return (pgprot_val(p) & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL); #else -#error "Unuspported architecture" +#error "Unsupported architecture" #endif } diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index b1c6231defad..673cf0359494 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -32,7 +32,7 @@ static int shm_get_kernel_pages(unsigned long start, size_t page_count, is_kmap_addr((void *)start))) return -EINVAL; - page = virt_to_page(start); + page = virt_to_page((void *)start); for (n = 0; n < page_count; n++) { pages[n] = page + n; get_page(pages[n]); diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 2e22bb82b738..e69868e868eb 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -193,8 +193,67 @@ static const struct attribute_group thermal_attr_group = { #define THERM_THROT_POLL_INTERVAL HZ #define THERM_STATUS_PROCHOT_LOG BIT(1) -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) +static u64 therm_intr_core_clear_mask; +static u64 therm_intr_pkg_clear_mask; + +static void thermal_intr_init_core_clear_mask(void) +{ + if (therm_intr_core_clear_mask) + return; + + /* + * Reference: Intel SDM Volume 4 + * "Table 2-2. IA-32 Architectural MSRs", MSR 0x19C + * IA32_THERM_STATUS. + */ + + /* + * Bit 1, 3, 5: CPUID.01H:EDX[22] = 1. This driver will not + * enable interrupts, when 0 as it checks for X86_FEATURE_ACPI. + */ + therm_intr_core_clear_mask = (BIT(1) | BIT(3) | BIT(5)); + + /* + * Bit 7 and 9: Thermal Threshold #1 and #2 log + * If CPUID.01H:ECX[8] = 1 + */ + if (boot_cpu_has(X86_FEATURE_TM2)) + therm_intr_core_clear_mask |= (BIT(7) | BIT(9)); + + /* Bit 11: Power Limitation log (R/WC0) If CPUID.06H:EAX[4] = 1 */ + if (boot_cpu_has(X86_FEATURE_PLN)) + therm_intr_core_clear_mask |= BIT(11); + + /* + * Bit 13: Current Limit log (R/WC0) If CPUID.06H:EAX[7] = 1 + * Bit 15: Cross Domain Limit log (R/WC0) If CPUID.06H:EAX[7] = 1 + */ + if (boot_cpu_has(X86_FEATURE_HWP)) + therm_intr_core_clear_mask |= (BIT(13) | BIT(15)); +} + +static void thermal_intr_init_pkg_clear_mask(void) +{ + if (therm_intr_pkg_clear_mask) + return; + + /* + * Reference: Intel SDM Volume 4 + * "Table 2-2. IA-32 Architectural MSRs", MSR 0x1B1 + * IA32_PACKAGE_THERM_STATUS. + */ + + /* All bits except BIT 26 depend on CPUID.06H: EAX[6] = 1 */ + if (boot_cpu_has(X86_FEATURE_PTS)) + therm_intr_pkg_clear_mask = (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)); + + /* + * Intel SDM Volume 2A: Thermal and Power Management Leaf + * Bit 26: CPUID.06H: EAX[19] = 1 + */ + if (boot_cpu_has(X86_FEATURE_HFI)) + therm_intr_pkg_clear_mask |= BIT(26); +} /* * Clear the bits in package thermal status register for bit = 1 @@ -207,13 +266,10 @@ void thermal_clear_package_intr_status(int level, u64 bit_mask) if (level == CORE_LEVEL) { msr = MSR_IA32_THERM_STATUS; - msr_val = THERM_STATUS_CLEAR_CORE_MASK; + msr_val = therm_intr_core_clear_mask; } else { msr = MSR_IA32_PACKAGE_THERM_STATUS; - msr_val = THERM_STATUS_CLEAR_PKG_MASK; - if (boot_cpu_has(X86_FEATURE_HFI)) - msr_val |= BIT(26); - + msr_val = therm_intr_pkg_clear_mask; } msr_val &= ~bit_mask; @@ -708,6 +764,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c) h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); + thermal_intr_init_core_clear_mask(); + thermal_intr_init_pkg_clear_mask(); + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) wrmsr(MSR_IA32_THERM_INTERRUPT, diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 520646ae7fa0..195963b82b63 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2467,10 +2467,11 @@ static int setup_driver(struct mlx5_vdpa_dev *mvdev) err = 0; goto out; } + mlx5_vdpa_add_debugfs(ndev); err = setup_virtqueues(mvdev); if (err) { mlx5_vdpa_warn(mvdev, "setup_virtqueues\n"); - goto out; + goto err_setup; } err = create_rqt(ndev); @@ -2500,6 +2501,8 @@ err_tir: destroy_rqt(ndev); err_rqt: teardown_virtqueues(ndev); +err_setup: + mlx5_vdpa_remove_debugfs(ndev->debugfs); out: return err; } @@ -2513,6 +2516,8 @@ static void teardown_driver(struct mlx5_vdpa_net *ndev) if (!ndev->setup) return; + mlx5_vdpa_remove_debugfs(ndev->debugfs); + ndev->debugfs = NULL; teardown_steering(ndev); destroy_tir(ndev); destroy_rqt(ndev); @@ -3261,7 +3266,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, if (err) goto err_reg; - mlx5_vdpa_add_debugfs(ndev); mgtdev->ndev = ndev; return 0; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index 862f405362de..dfe2ce341803 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -466,16 +466,21 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, vdpasim_net_setup_config(simdev, config); - ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM); - if (ret) - goto reg_err; - net = sim_to_net(simdev); u64_stats_init(&net->tx_stats.syncp); u64_stats_init(&net->rx_stats.syncp); u64_stats_init(&net->cq_stats.syncp); + /* + * Initialization must be completed before this call, since it can + * connect the device to the vDPA bus, so requests can arrive after + * this call. + */ + ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM); + if (ret) + goto reg_err; + return 0; reg_err: diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 587fbae06182..b455d9ab6f3d 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -13,9 +13,14 @@ config VHOST_RING This option is selected by any driver which needs to access the host side of a virtio ring. +config VHOST_TASK + bool + default n + config VHOST tristate select VHOST_IOTLB + select VHOST_TASK help This option is selected by any driver which needs to access the core of vhost. diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index b244e7c0f514..e68f7d226bc9 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -125,7 +125,6 @@ struct vhost_scsi_tpg { struct se_portal_group se_tpg; /* Pointer back to vhost_scsi, protected by tv_tpg_mutex */ struct vhost_scsi *vhost_scsi; - struct list_head tmf_queue; }; struct vhost_scsi_tport { @@ -206,10 +205,8 @@ struct vhost_scsi { struct vhost_scsi_tmf { struct vhost_work vwork; - struct vhost_scsi_tpg *tpg; struct vhost_scsi *vhost; struct vhost_scsi_virtqueue *svq; - struct list_head queue_entry; struct se_cmd se_cmd; u8 scsi_resp; @@ -352,12 +349,9 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd) static void vhost_scsi_release_tmf_res(struct vhost_scsi_tmf *tmf) { - struct vhost_scsi_tpg *tpg = tmf->tpg; struct vhost_scsi_inflight *inflight = tmf->inflight; - mutex_lock(&tpg->tv_tpg_mutex); - list_add_tail(&tpg->tmf_queue, &tmf->queue_entry); - mutex_unlock(&tpg->tv_tpg_mutex); + kfree(tmf); vhost_scsi_put_inflight(inflight); } @@ -671,7 +665,7 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls) { int sgl_count = 0; - if (!iter || !iter->iov) { + if (!iter || !iter_iov(iter)) { pr_err("%s: iter->iov is NULL, but expected bytes: %zu" " present\n", __func__, bytes); return -EINVAL; @@ -1194,19 +1188,11 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg, goto send_reject; } - mutex_lock(&tpg->tv_tpg_mutex); - if (list_empty(&tpg->tmf_queue)) { - pr_err("Missing reserve TMF. Could not handle LUN RESET.\n"); - mutex_unlock(&tpg->tv_tpg_mutex); + tmf = kzalloc(sizeof(*tmf), GFP_KERNEL); + if (!tmf) goto send_reject; - } - - tmf = list_first_entry(&tpg->tmf_queue, struct vhost_scsi_tmf, - queue_entry); - list_del_init(&tmf->queue_entry); - mutex_unlock(&tpg->tv_tpg_mutex); - tmf->tpg = tpg; + vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work); tmf->vhost = vs; tmf->svq = svq; tmf->resp_iov = vq->iov[vc->out]; @@ -1658,7 +1644,10 @@ undepend: for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) { tpg = vs_tpg[i]; if (tpg) { + mutex_lock(&tpg->tv_tpg_mutex); + tpg->vhost_scsi = NULL; tpg->tv_tpg_vhost_count--; + mutex_unlock(&tpg->tv_tpg_mutex); target_undepend_item(&tpg->se_tpg.tpg_group.cg_item); } } @@ -2032,19 +2021,11 @@ static int vhost_scsi_port_link(struct se_portal_group *se_tpg, { struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg, se_tpg); - struct vhost_scsi_tmf *tmf; - - tmf = kzalloc(sizeof(*tmf), GFP_KERNEL); - if (!tmf) - return -ENOMEM; - INIT_LIST_HEAD(&tmf->queue_entry); - vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work); mutex_lock(&vhost_scsi_mutex); mutex_lock(&tpg->tv_tpg_mutex); tpg->tv_tpg_port_count++; - list_add_tail(&tmf->queue_entry, &tpg->tmf_queue); mutex_unlock(&tpg->tv_tpg_mutex); vhost_scsi_hotplug(tpg, lun); @@ -2059,16 +2040,11 @@ static void vhost_scsi_port_unlink(struct se_portal_group *se_tpg, { struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg, se_tpg); - struct vhost_scsi_tmf *tmf; mutex_lock(&vhost_scsi_mutex); mutex_lock(&tpg->tv_tpg_mutex); tpg->tv_tpg_port_count--; - tmf = list_first_entry(&tpg->tmf_queue, struct vhost_scsi_tmf, - queue_entry); - list_del(&tmf->queue_entry); - kfree(tmf); mutex_unlock(&tpg->tv_tpg_mutex); vhost_scsi_hotunplug(tpg, lun); @@ -2329,7 +2305,6 @@ vhost_scsi_make_tpg(struct se_wwn *wwn, const char *name) } mutex_init(&tpg->tv_tpg_mutex); INIT_LIST_HEAD(&tpg->tv_tpg_list); - INIT_LIST_HEAD(&tpg->tmf_queue); tpg->tport = tport; tpg->tport_tpgt = tpgt; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f11bdbe4c2c5..6d07b42833be 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -22,11 +22,11 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/kthread.h> -#include <linux/cgroup.h> #include <linux/module.h> #include <linux/sort.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> +#include <linux/sched/vhost_task.h> #include <linux/interval_tree_generic.h> #include <linux/nospec.h> #include <linux/kcov.h> @@ -255,8 +255,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) * sure it was not in the list. * test_and_set_bit() implies a memory barrier. */ - llist_add(&work->node, &dev->work_list); - wake_up_process(dev->worker); + llist_add(&work->node, &dev->worker->work_list); + wake_up_process(dev->worker->vtsk->task); } } EXPORT_SYMBOL_GPL(vhost_work_queue); @@ -264,7 +264,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue); /* A lockless hint for busy polling code to exit the loop */ bool vhost_has_work(struct vhost_dev *dev) { - return !llist_empty(&dev->work_list); + return dev->worker && !llist_empty(&dev->worker->work_list); } EXPORT_SYMBOL_GPL(vhost_has_work); @@ -335,22 +335,20 @@ static void vhost_vq_reset(struct vhost_dev *dev, static int vhost_worker(void *data) { - struct vhost_dev *dev = data; + struct vhost_worker *worker = data; struct vhost_work *work, *work_next; struct llist_node *node; - kthread_use_mm(dev->mm); - for (;;) { /* mb paired w/ kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { + if (vhost_task_should_stop(worker->vtsk)) { __set_current_state(TASK_RUNNING); break; } - node = llist_del_all(&dev->work_list); + node = llist_del_all(&worker->work_list); if (!node) schedule(); @@ -360,14 +358,14 @@ static int vhost_worker(void *data) llist_for_each_entry_safe(work, work_next, node, node) { clear_bit(VHOST_WORK_QUEUED, &work->flags); __set_current_state(TASK_RUNNING); - kcov_remote_start_common(dev->kcov_handle); + kcov_remote_start_common(worker->kcov_handle); work->fn(work); kcov_remote_stop(); if (need_resched()) schedule(); } } - kthread_unuse_mm(dev->mm); + return 0; } @@ -479,7 +477,6 @@ void vhost_dev_init(struct vhost_dev *dev, dev->byte_weight = byte_weight; dev->use_worker = use_worker; dev->msg_handler = msg_handler; - init_llist_head(&dev->work_list); init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); INIT_LIST_HEAD(&dev->pending_list); @@ -509,31 +506,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_check_owner); -struct vhost_attach_cgroups_struct { - struct vhost_work work; - struct task_struct *owner; - int ret; -}; - -static void vhost_attach_cgroups_work(struct vhost_work *work) -{ - struct vhost_attach_cgroups_struct *s; - - s = container_of(work, struct vhost_attach_cgroups_struct, work); - s->ret = cgroup_attach_task_all(s->owner, current); -} - -static int vhost_attach_cgroups(struct vhost_dev *dev) -{ - struct vhost_attach_cgroups_struct attach; - - attach.owner = current; - vhost_work_init(&attach.work, vhost_attach_cgroups_work); - vhost_work_queue(dev, &attach.work); - vhost_dev_flush(dev); - return attach.ret; -} - /* Caller should have device mutex */ bool vhost_dev_has_owner(struct vhost_dev *dev) { @@ -571,10 +543,54 @@ static void vhost_detach_mm(struct vhost_dev *dev) dev->mm = NULL; } +static void vhost_worker_free(struct vhost_dev *dev) +{ + struct vhost_worker *worker = dev->worker; + + if (!worker) + return; + + dev->worker = NULL; + WARN_ON(!llist_empty(&worker->work_list)); + vhost_task_stop(worker->vtsk); + kfree(worker); +} + +static int vhost_worker_create(struct vhost_dev *dev) +{ + struct vhost_worker *worker; + struct vhost_task *vtsk; + char name[TASK_COMM_LEN]; + int ret; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); + if (!worker) + return -ENOMEM; + + dev->worker = worker; + worker->kcov_handle = kcov_common_handle(); + init_llist_head(&worker->work_list); + snprintf(name, sizeof(name), "vhost-%d", current->pid); + + vtsk = vhost_task_create(vhost_worker, worker, name); + if (!vtsk) { + ret = -ENOMEM; + goto free_worker; + } + + worker->vtsk = vtsk; + vhost_task_start(vtsk); + return 0; + +free_worker: + kfree(worker); + dev->worker = NULL; + return ret; +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { - struct task_struct *worker; int err; /* Is there an owner already? */ @@ -585,36 +601,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev) vhost_attach_mm(dev); - dev->kcov_handle = kcov_common_handle(); if (dev->use_worker) { - worker = kthread_create(vhost_worker, dev, - "vhost-%d", current->pid); - if (IS_ERR(worker)) { - err = PTR_ERR(worker); - goto err_worker; - } - - dev->worker = worker; - wake_up_process(worker); /* avoid contributing to loadavg */ - - err = vhost_attach_cgroups(dev); + err = vhost_worker_create(dev); if (err) - goto err_cgroup; + goto err_worker; } err = vhost_dev_alloc_iovecs(dev); if (err) - goto err_cgroup; + goto err_iovecs; return 0; -err_cgroup: - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; - } +err_iovecs: + vhost_worker_free(dev); err_worker: vhost_detach_mm(dev); - dev->kcov_handle = 0; err_mm: return err; } @@ -705,12 +706,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); - WARN_ON(!llist_empty(&dev->work_list)); - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; - dev->kcov_handle = 0; - } + vhost_worker_free(dev); vhost_detach_mm(dev); } EXPORT_SYMBOL_GPL(vhost_dev_cleanup); @@ -1831,7 +1827,7 @@ EXPORT_SYMBOL_GPL(vhost_dev_ioctl); /* TODO: This is really inefficient. We need something like get_user() * (instruction directly accesses the data, with an exception table entry - * returning -EFAULT). See Documentation/x86/exception-tables.rst. + * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst. */ static int set_bit_to_user(int nr, void __user *addr) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 1647b750169c..0308638cdeee 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -16,6 +16,7 @@ #include <linux/irqbypass.h> struct vhost_work; +struct vhost_task; typedef void (*vhost_work_fn_t)(struct vhost_work *work); #define VHOST_WORK_QUEUED 1 @@ -25,6 +26,12 @@ struct vhost_work { unsigned long flags; }; +struct vhost_worker { + struct vhost_task *vtsk; + struct llist_head work_list; + u64 kcov_handle; +}; + /* Poll a file (eventfd or socket) */ /* Note: there's nothing vhost specific about this structure. */ struct vhost_poll { @@ -147,8 +154,7 @@ struct vhost_dev { struct vhost_virtqueue **vqs; int nvqs; struct eventfd_ctx *log_ctx; - struct llist_head work_list; - struct task_struct *worker; + struct vhost_worker *worker; struct vhost_iotlb *umem; struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; @@ -158,7 +164,6 @@ struct vhost_dev { int iov_limit; int weight; int byte_weight; - u64 kcov_handle; bool use_worker; int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg); diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 0a2c47df01f4..eb565a10e5cd 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -823,7 +823,7 @@ static int set_con2fb_map(int unit, int newidx, int user) int oldidx = con2fb_map[unit]; struct fb_info *info = fbcon_registered_fb[newidx]; struct fb_info *oldinfo = NULL; - int found, err = 0, show_logo; + int err = 0, show_logo; WARN_CONSOLE_UNLOCKED(); @@ -841,26 +841,26 @@ static int set_con2fb_map(int unit, int newidx, int user) if (oldidx != -1) oldinfo = fbcon_registered_fb[oldidx]; - found = search_fb_in_map(newidx); - - if (!err && !found) { + if (!search_fb_in_map(newidx)) { err = con2fb_acquire_newinfo(vc, info, unit); - if (!err) - con2fb_map[unit] = newidx; + if (err) + return err; + + fbcon_add_cursor_work(info); } + con2fb_map[unit] = newidx; + /* * If old fb is not mapped to any of the consoles, * fbcon should release it. */ - if (!err && oldinfo && !search_fb_in_map(oldidx)) + if (oldinfo && !search_fb_in_map(oldidx)) con2fb_release_oldinfo(vc, oldinfo, info); show_logo = (fg_console == 0 && !user && logo_shown != FBCON_LOGO_DONTSHOW); - if (!found) - fbcon_add_cursor_work(info); con2fb_map_boot[unit] = newidx; con2fb_init_display(vc, info, unit, show_logo); diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 875541ff185b..3fd95a79e4c3 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1116,6 +1116,8 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, case FBIOPUT_VSCREENINFO: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; + /* only for kernel-internal use */ + var.activate &= ~FB_ACTIVATE_KD_TEXT; console_lock(); lock_fb_info(info); ret = fbcon_modechange_possible(info, &var); diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 46f1a8d558b0..97dbe715e96a 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -46,7 +46,15 @@ struct snp_guest_dev { void *certs_data; struct snp_guest_crypto *crypto; + /* request and response are in unencrypted memory */ struct snp_guest_msg *request, *response; + + /* + * Avoid information leakage by double-buffering shared messages + * in fields that are in regular encrypted memory. + */ + struct snp_guest_msg secret_request, secret_response; + struct snp_secrets_page_layout *layout; struct snp_req_data input; u32 *os_area_msg_seqno; @@ -266,14 +274,17 @@ static int dec_payload(struct snp_guest_dev *snp_dev, struct snp_guest_msg *msg, static int verify_and_dec_payload(struct snp_guest_dev *snp_dev, void *payload, u32 sz) { struct snp_guest_crypto *crypto = snp_dev->crypto; - struct snp_guest_msg *resp = snp_dev->response; - struct snp_guest_msg *req = snp_dev->request; + struct snp_guest_msg *resp = &snp_dev->secret_response; + struct snp_guest_msg *req = &snp_dev->secret_request; struct snp_guest_msg_hdr *req_hdr = &req->hdr; struct snp_guest_msg_hdr *resp_hdr = &resp->hdr; dev_dbg(snp_dev->dev, "response [seqno %lld type %d version %d sz %d]\n", resp_hdr->msg_seqno, resp_hdr->msg_type, resp_hdr->msg_version, resp_hdr->msg_sz); + /* Copy response from shared memory to encrypted memory. */ + memcpy(resp, snp_dev->response, sizeof(*resp)); + /* Verify that the sequence counter is incremented by 1 */ if (unlikely(resp_hdr->msg_seqno != (req_hdr->msg_seqno + 1))) return -EBADMSG; @@ -297,7 +308,7 @@ static int verify_and_dec_payload(struct snp_guest_dev *snp_dev, void *payload, static int enc_payload(struct snp_guest_dev *snp_dev, u64 seqno, int version, u8 type, void *payload, size_t sz) { - struct snp_guest_msg *req = snp_dev->request; + struct snp_guest_msg *req = &snp_dev->secret_request; struct snp_guest_msg_hdr *hdr = &req->hdr; memset(req, 0, sizeof(*req)); @@ -321,11 +332,12 @@ static int enc_payload(struct snp_guest_dev *snp_dev, u64 seqno, int version, u8 return __enc_payload(snp_dev, req, payload, sz); } -static int __handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, __u64 *fw_err) +static int __handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, + struct snp_guest_request_ioctl *rio) { - unsigned long err = 0xff, override_err = 0; unsigned long req_start = jiffies; unsigned int override_npages = 0; + u64 override_err = 0; int rc; retry_request: @@ -335,7 +347,7 @@ retry_request: * sequence number must be incremented or the VMPCK must be deleted to * prevent reuse of the IV. */ - rc = snp_issue_guest_request(exit_code, &snp_dev->input, &err); + rc = snp_issue_guest_request(exit_code, &snp_dev->input, rio); switch (rc) { case -ENOSPC: /* @@ -353,7 +365,7 @@ retry_request: * request buffer size was too small and give the caller the * required buffer size. */ - override_err = SNP_GUEST_REQ_INVALID_LEN; + override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN); /* * If this call to the firmware succeeds, the sequence number can @@ -366,7 +378,7 @@ retry_request: goto retry_request; /* - * The host may return SNP_GUEST_REQ_ERR_EBUSY if the request has been + * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been * throttled. Retry in the driver to avoid returning and reusing the * message sequence number on a different message. */ @@ -387,27 +399,29 @@ retry_request: */ snp_inc_msg_seqno(snp_dev); - if (fw_err) - *fw_err = override_err ?: err; + if (override_err) { + rio->exitinfo2 = override_err; + + /* + * If an extended guest request was issued and the supplied certificate + * buffer was not large enough, a standard guest request was issued to + * prevent IV reuse. If the standard request was successful, return -EIO + * back to the caller as would have originally been returned. + */ + if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + rc = -EIO; + } if (override_npages) snp_dev->input.data_npages = override_npages; - /* - * If an extended guest request was issued and the supplied certificate - * buffer was not large enough, a standard guest request was issued to - * prevent IV reuse. If the standard request was successful, return -EIO - * back to the caller as would have originally been returned. - */ - if (!rc && override_err == SNP_GUEST_REQ_INVALID_LEN) - return -EIO; - return rc; } -static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, int msg_ver, - u8 type, void *req_buf, size_t req_sz, void *resp_buf, - u32 resp_sz, __u64 *fw_err) +static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, + struct snp_guest_request_ioctl *rio, u8 type, + void *req_buf, size_t req_sz, void *resp_buf, + u32 resp_sz) { u64 seqno; int rc; @@ -417,19 +431,31 @@ static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, in if (!seqno) return -EIO; + /* Clear shared memory's response for the host to populate. */ memset(snp_dev->response, 0, sizeof(struct snp_guest_msg)); - /* Encrypt the userspace provided payload */ - rc = enc_payload(snp_dev, seqno, msg_ver, type, req_buf, req_sz); + /* Encrypt the userspace provided payload in snp_dev->secret_request. */ + rc = enc_payload(snp_dev, seqno, rio->msg_version, type, req_buf, req_sz); if (rc) return rc; - rc = __handle_guest_request(snp_dev, exit_code, fw_err); + /* + * Write the fully encrypted request to the shared unencrypted + * request page. + */ + memcpy(snp_dev->request, &snp_dev->secret_request, + sizeof(snp_dev->secret_request)); + + rc = __handle_guest_request(snp_dev, exit_code, rio); if (rc) { - if (rc == -EIO && *fw_err == SNP_GUEST_REQ_INVALID_LEN) + if (rc == -EIO && + rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) return rc; - dev_alert(snp_dev->dev, "Detected error from ASP request. rc: %d, fw_err: %llu\n", rc, *fw_err); + dev_alert(snp_dev->dev, + "Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n", + rc, rio->exitinfo2); + snp_disable_vmpck(snp_dev); return rc; } @@ -469,9 +495,9 @@ static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_io if (!resp) return -ENOMEM; - rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version, + rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg, SNP_MSG_REPORT_REQ, &req, sizeof(req), resp->data, - resp_len, &arg->fw_err); + resp_len); if (rc) goto e_free; @@ -509,9 +535,8 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque if (copy_from_user(&req, (void __user *)arg->req_data, sizeof(req))) return -EFAULT; - rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version, - SNP_MSG_KEY_REQ, &req, sizeof(req), buf, resp_len, - &arg->fw_err); + rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg, + SNP_MSG_KEY_REQ, &req, sizeof(req), buf, resp_len); if (rc) return rc; @@ -571,12 +596,12 @@ cmd: return -ENOMEM; snp_dev->input.data_npages = npages; - ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg->msg_version, + ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg, SNP_MSG_REPORT_REQ, &req.data, - sizeof(req.data), resp->data, resp_len, &arg->fw_err); + sizeof(req.data), resp->data, resp_len); /* If certs length is invalid then copy the returned length */ - if (arg->fw_err == SNP_GUEST_REQ_INVALID_LEN) { + if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) { req.certs_len = snp_dev->input.data_npages << PAGE_SHIFT; if (copy_to_user((void __user *)arg->req_data, &req, sizeof(req))) @@ -611,7 +636,7 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long if (copy_from_user(&input, argp, sizeof(input))) return -EFAULT; - input.fw_err = 0xff; + input.exitinfo2 = 0xff; /* Message version must be non-zero */ if (!input.msg_version) @@ -642,7 +667,7 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long mutex_unlock(&snp_cmd_mutex); - if (input.fw_err && copy_to_user(argp, &input, sizeof(input))) + if (input.exitinfo2 && copy_to_user(argp, &input, sizeof(input))) return -EFAULT; return ret; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 50f7f3f6b55e..e00cf8109b3f 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -35,10 +35,12 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, return retval; } if (attr_size > buffer_size) { - if (!buffer_size) /* request to get the attr_size */ - retval = attr_size; - else + if (buffer_size) retval = -ERANGE; + else if (attr_size > SSIZE_MAX) + retval = -EOVERFLOW; + else /* request to get the attr_size */ + retval = attr_size; } else { iov_iter_truncate(&to, attr_size); retval = p9_client_read(attr_fid, 0, &to, &err); @@ -183,10 +185,6 @@ static struct xattr_handler v9fs_xattr_security_handler = { const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif #ifdef CONFIG_9P_FS_SECURITY &v9fs_xattr_security_handler, #endif diff --git a/fs/attr.c b/fs/attr.c index aca9ff7aed33..d60dc1edb526 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -47,6 +47,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, return ATTR_KILL_SGID; return 0; } +EXPORT_SYMBOL(setattr_should_drop_sgid); /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 317aeff6c1da..a6d77fe41e1a 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -56,11 +56,9 @@ #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) -/* Target completion latency of discarding all discardable extents */ -#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC) #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) -#define BTRFS_DISCARD_MAX_IOPS (10U) +#define BTRFS_DISCARD_MAX_IOPS (1000U) /* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { @@ -577,6 +575,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) s32 discardable_extents; s64 discardable_bytes; u32 iops_limit; + unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC; unsigned long delay; discardable_extents = atomic_read(&discard_ctl->discardable_extents); @@ -607,13 +606,19 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) } iops_limit = READ_ONCE(discard_ctl->iops_limit); - if (iops_limit) + + if (iops_limit) { delay = MSEC_PER_SEC / iops_limit; - else - delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; + } else { + /* + * Unset iops_limit means go as fast as possible, so allow a + * delay of 0. + */ + delay = 0; + min_delay = 0; + } - delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC, - BTRFS_DISCARD_MAX_DELAY_MSEC); + delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC); discard_ctl->delay_ms = delay; spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b53f0e30ce2b..9e1596bb208d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2250,6 +2250,20 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + /* + * Check if the checksum implementation is a fast accelerated one. + * As-is this is a bit of a hack and should be replaced once the csum + * implementations provide that information themselves. + */ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; + default: + break; + } + btrfs_info(fs_info, "using %s (%s) checksum algorithm", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(csum_shash)); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5cc5a1faaef5..f649647392e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3730,10 +3730,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, if (!iter_is_iovec(iter)) return 0; - for (seg = 0; seg < iter->nr_segs; seg++) - for (i = seg + 1; i < iter->nr_segs; i++) - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + const struct iovec *iov1 = iter_iov(iter) + seg; + const struct iovec *iov2 = iter_iov(iter) + i; + + if (iov1->iov_base == iov2->iov_base) return -EINVAL; + } + } return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 581845bc206a..366fb4cde145 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1516,8 +1516,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); btrfs_sb(s)->bdev_holder = fs_type; - if (!strstr(crc32c_impl(), "generic")) - set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); error = btrfs_fill_super(s, fs_devices, data); } if (!error) @@ -1631,6 +1629,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + workqueue_set_max_active(fs_info->endio_workers, new_pool_size); + workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 0ebeaf4e81f9..fc4b20c2688a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -444,10 +444,6 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = { const struct xattr_handler *btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, &btrfs_btrfs_xattr_handler, diff --git a/fs/buffer.c b/fs/buffer.c index 9e1e2add541e..10390f53f3f5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2581,7 +2581,7 @@ int block_truncate_page(struct address_space *mapping, struct inode *inode = mapping->host; struct page *page; struct buffer_head *bh; - int err; + int err = 0; blocksize = i_blocksize(inode); length = offset & (blocksize - 1); @@ -2594,9 +2594,8 @@ int block_truncate_page(struct address_space *mapping, iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); - err = -ENOMEM; if (!page) - goto out; + return -ENOMEM; if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -2610,7 +2609,6 @@ int block_truncate_page(struct address_space *mapping, pos += blocksize; } - err = 0; if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); @@ -2634,12 +2632,11 @@ int block_truncate_page(struct address_space *mapping, zero_user(page, offset, length); mark_buffer_dirty(bh); - err = 0; unlock: unlock_page(page); put_page(page); -out: + return err; } EXPORT_SYMBOL(block_truncate_page); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f65b07cc33a2..1fe1b62abebd 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1411,10 +1411,6 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) * attributes are handled directly. */ const struct xattr_handler *ceph_xattr_handlers[] = { -#ifdef CONFIG_CEPH_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ceph_other_xattr_handler, NULL, }; diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index cb40074feb3e..0329a907bdfe 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -171,8 +171,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) mnt = ERR_CAST(full_path); goto out; } - - convert_delimiter(full_path, '/'); cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path); tmp = *cur_ctx; diff --git a/fs/cifs/dfs.h b/fs/cifs/dfs.h index 13f26e01f7b9..0b8cbf721fff 100644 --- a/fs/cifs/dfs.h +++ b/fs/cifs/dfs.h @@ -34,19 +34,33 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p cifs_remap(cifs_sb), path, ref, tl); } +/* Return DFS full path out of a dentry set for automount */ static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) { struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct TCP_Server_Info *server = tcon->ses->server; + size_t len; + char *s; if (unlikely(!server->origin_fullpath)) return ERR_PTR(-EREMOTE); - return __build_path_from_dentry_optional_prefix(dentry, page, - server->origin_fullpath, - strlen(server->origin_fullpath), - true); + s = dentry_path_raw(dentry, page, PATH_MAX); + if (IS_ERR(s)) + return s; + /* for root, we want "" */ + if (!s[1]) + s++; + + len = strlen(server->origin_fullpath); + if (s < (char *)page + len) + return ERR_PTR(-ENAMETOOLONG); + + s -= len; + memcpy(s, server->origin_fullpath, len); + convert_delimiter(s, '/'); + return s; } static inline void dfs_put_root_smb_sessions(struct list_head *head) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6831a9949c43..b33d2e7b0f98 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4010,7 +4010,6 @@ static void collect_uncached_read_data(struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata, *tmp; - struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; int rc; @@ -4076,9 +4075,6 @@ again: kref_put(&rdata->refcount, cifs_readdata_release); } - if (!ctx->direct_io) - ctx->total_len = ctx->len - iov_iter_count(to); - /* mask nodata case */ if (rc == -ENODATA) rc = 0; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2b92132097dc..366f0c3b799b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -587,11 +587,15 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, } +/* If invalid preauth context warn but use what we requested, SHA-512 */ static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) { unsigned int len = le16_to_cpu(ctxt->DataLength); - /* If invalid preauth context warn but use what we requested, SHA-512 */ + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one HashAlgorithms member is accounted for. + */ if (len < MIN_PREAUTH_CTXT_DATA_LEN) { pr_warn_once("server sent bad preauth context\n"); return; @@ -610,7 +614,11 @@ static void decode_compress_ctx(struct TCP_Server_Info *server, { unsigned int len = le16_to_cpu(ctxt->DataLength); - /* sizeof compress context is a one element compression capbility struct */ + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one CompressionAlgorithms member is accounted + * for. + */ if (len < 10) { pr_warn_once("server sent bad compression cntxt\n"); return; @@ -632,6 +640,11 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, unsigned int len = le16_to_cpu(ctxt->DataLength); cifs_dbg(FYI, "decode SMB3.11 encryption neg context of len %d\n", len); + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one Cipher flexible array member is accounted + * for. + */ if (len < MIN_ENCRYPT_CTXT_DATA_LEN) { pr_warn_once("server sent bad crypto ctxt len\n"); return -EINVAL; @@ -678,6 +691,11 @@ static void decode_signing_ctx(struct TCP_Server_Info *server, { unsigned int len = le16_to_cpu(pctxt->DataLength); + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one SigningAlgorithms flexible array member is + * accounted for. + */ if ((len < 4) || (len > 16)) { pr_warn_once("server sent bad signing negcontext\n"); return; @@ -719,14 +737,19 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, for (i = 0; i < ctxt_cnt; i++) { int clen; /* check that offset is not beyond end of SMB */ - if (len_of_ctxts == 0) - break; - if (len_of_ctxts < sizeof(struct smb2_neg_context)) break; pctx = (struct smb2_neg_context *)(offset + (char *)rsp); - clen = le16_to_cpu(pctx->DataLength); + clen = sizeof(struct smb2_neg_context) + + le16_to_cpu(pctx->DataLength); + /* + * 2.2.4 SMB2 NEGOTIATE Response + * Subsequent negotiate contexts MUST appear at the first 8-byte + * aligned offset following the previous negotiate context. + */ + if (i + 1 != ctxt_cnt) + clen = ALIGN(clen, 8); if (clen > len_of_ctxts) break; @@ -747,12 +770,10 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, else cifs_server_dbg(VFS, "unknown negcontext of type %d ignored\n", le16_to_cpu(pctx->ContextType)); - if (rc) break; - /* offsets must be 8 byte aligned */ - clen = ALIGN(clen, 8); - offset += clen + sizeof(struct smb2_neg_context); + + offset += clen; len_of_ctxts -= clen; } return rc; @@ -4159,10 +4180,12 @@ smb2_readv_callback(struct mid_q_entry *mid) struct smb2_hdr *shdr = (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; - struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], - .rq_nvec = 1, - .rq_iter = rdata->iter, - .rq_iter_size = iov_iter_count(&rdata->iter), }; + struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1 }; + + if (rdata->got_bytes) { + rqst.rq_iter = rdata->iter; + rqst.rq_iter_size = iov_iter_count(&rdata->iter); + }; WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 50e762fa1a14..4ad5531686d8 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -487,9 +487,5 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_cifs_ntsd_full_xattr_handler, &smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */ -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif NULL }; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 4afcbbe63e68..18677cd4e62f 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1599,12 +1599,6 @@ static int configfs_dir_close(struct inode *inode, struct file *file) return 0; } -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct configfs_dirent *sd) -{ - return (sd->s_mode >> 12) & 15; -} - static int configfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; @@ -1654,7 +1648,8 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) name = configfs_get_name(next); len = strlen(name); - if (!dir_emit(ctx, name, len, ino, dt_type(next))) + if (!dir_emit(ctx, name, len, ino, + fs_umode_to_dtype(next->s_mode))) return 0; spin_lock(&configfs_dirent_lock); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 4f25015aa534..fe3db0eda8e4 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -72,24 +72,6 @@ static struct ctl_table pty_table[] = { {} }; -static struct ctl_table pty_kern_table[] = { - { - .procname = "pty", - .mode = 0555, - .child = pty_table, - }, - {} -}; - -static struct ctl_table pty_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = pty_kern_table, - }, - {} -}; - struct pts_mount_opts { int setuid; int setgid; @@ -630,7 +612,7 @@ static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); if (!err) { - register_sysctl_table(pty_root_table); + register_sysctl("kernel/pty", pty_table); } return err; } diff --git a/fs/direct-io.c b/fs/direct-io.c index ab0d7ea89813..0b380bb8a81e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -86,7 +86,6 @@ struct dio_submit { sector_t final_block_in_request;/* doesn't change */ int boundary; /* prev block is at a boundary */ get_block_t *get_block; /* block mapping function */ - dio_submit_t *submit_io; /* IO submition function */ loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ @@ -431,10 +430,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->bio_disk = bio->bi_bdev->bd_disk; - if (sdio->submit_io) - sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); - else - submit_bio(bio); + submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -1098,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio) ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) + int flags) { unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; @@ -1215,7 +1211,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, sdio.get_block = get_block; dio->end_io = end_io; - sdio.submit_io = submit_io; sdio.final_block_in_bio = -1; sdio.next_block_for_io = -1; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 144ace9e0dd9..83274915ba6d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1210,10 +1210,6 @@ static const struct xattr_handler ecryptfs_xattr_handler = { }; const struct xattr_handler *ecryptfs_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ecryptfs_xattr_handler, NULL }; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c08c0f578bc6..6fe9a779fa91 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -27,11 +27,15 @@ void erofs_put_metabuf(struct erofs_buf *buf) buf->page = NULL; } -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +/* + * Derive the block size from inode->i_blkbits to make compatible with + * anonymous inode in fscache mode. + */ +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type) { - struct address_space *const mapping = inode->i_mapping; - erofs_off_t offset = blknr_to_addr(blkaddr); + struct inode *inode = buf->inode; + erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; pgoff_t index = offset >> PAGE_SHIFT; struct page *page = buf->page; struct folio *folio; @@ -41,7 +45,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, erofs_put_metabuf(buf); nofs_flag = memalloc_nofs_save(); - folio = read_cache_folio(mapping, index, NULL, NULL); + folio = read_cache_folio(inode->i_mapping, index, NULL, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(folio)) return folio; @@ -63,14 +67,19 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, return buf->base + (offset & ~PAGE_MASK); } -void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { if (erofs_is_fscache_mode(sb)) - return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode, - blkaddr, type); + buf->inode = EROFS_SB(sb)->s_fscache->inode; + else + buf->inode = sb->s_bdev->bd_inode; +} - return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type); +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type) +{ + erofs_init_metabuf(buf, sb); + return erofs_bread(buf, blkaddr, type); } static int erofs_map_blocks_flatmode(struct inode *inode, @@ -79,33 +88,32 @@ static int erofs_map_blocks_flatmode(struct inode *inode, erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + nblocks = erofs_iblks(inode); lastblk = nblocks - tailendpacking; /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; - if (offset < blknr_to_addr(lastblk)) { - map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; - map->m_plen = blknr_to_addr(lastblk) - offset; + if (offset < erofs_pos(sb, lastblk)) { + map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; + map->m_plen = erofs_pos(sb, lastblk) - offset; } else if (tailendpacking) { map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(offset); + vi->xattr_isize + erofs_blkoff(sb, offset); map->m_plen = inode->i_size - offset; /* inline data should be located in the same meta block */ - if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) { - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "inline data cross block boundary @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } map->m_flags |= EROFS_MAP_META; } else { - erofs_err(inode->i_sb, - "internal error @ nid: %llu (size %llu), m_la 0x%llx", + erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", vi->nid, inode->i_size, map->m_la); DBG_BUGON(1); return -EIO; @@ -148,29 +156,29 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out; } map->m_la = chunknr << vi->chunkbits; map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, - roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); + round_up(inode->i_size - map->m_la, sb->s_blocksize)); /* handle block map */ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr + erofs_blkoff(pos); + __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos); if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { map->m_flags = 0; } else { - map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr)); + map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr)); map->m_flags = EROFS_MAP_MAPPED; } goto out_unlock; } /* parse chunk indexes */ - idx = kaddr + erofs_blkoff(pos); + idx = kaddr + erofs_blkoff(sb, pos); switch (le32_to_cpu(idx->blkaddr)) { case EROFS_NULL_ADDR: map->m_flags = 0; @@ -178,7 +186,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) default: map->m_deviceid = le16_to_cpu(idx->device_id) & EROFS_SB(sb)->device_id_mask; - map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); + map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; } @@ -197,7 +205,6 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) struct erofs_device_info *dif; int id; - /* primary device by default */ map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; @@ -210,20 +217,25 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return -ENODEV; } + if (devs->flatdev) { + map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); + up_read(&devs->rwsem); + return 0; + } map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; up_read(&devs->rwsem); - } else if (devs->extra_devices) { + } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { erofs_off_t startoff, length; if (!dif->mapped_blkaddr) continue; - startoff = blknr_to_addr(dif->mapped_blkaddr); - length = blknr_to_addr(dif->blocks); + startoff = erofs_pos(sb, dif->mapped_blkaddr); + length = erofs_pos(sb, dif->blocks); if (map->m_pa >= startoff && map->m_pa < startoff + length) { @@ -244,6 +256,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; + struct super_block *sb = inode->i_sb; struct erofs_map_blocks map; struct erofs_map_dev mdev; @@ -258,7 +271,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; - ret = erofs_map_dev(inode->i_sb, &mdev); + ret = erofs_map_dev(sb, &mdev); if (ret) return ret; @@ -284,11 +297,11 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, inode->i_sb, - erofs_blknr(mdev.m_pa), EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, + erofs_blknr(sb, mdev.m_pa), EROFS_KMAP); if (IS_ERR(ptr)) return PTR_ERR(ptr); - iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa); + iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa); iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 51b7ac7166d9..7021e2cf6146 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -42,7 +42,7 @@ int z_erofs_load_lz4_config(struct super_block *sb, if (!sbi->lz4.max_pclusterblks) { sbi->lz4.max_pclusterblks = 1; /* reserved case */ } else if (sbi->lz4.max_pclusterblks > - Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) { erofs_err(sb, "too large lz4 pclusterblks %u", sbi->lz4.max_pclusterblks); return -EINVAL; @@ -221,13 +221,13 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, support_0padding = true; ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, min_t(unsigned int, rq->inputsize, - EROFS_BLKSIZ - rq->pageofs_in)); + rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { kunmap_atomic(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & - (EROFS_BLKSIZ - 1)); + (rq->sb->s_blocksize - 1)); } inputmargin = rq->pageofs_in; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index d38e19c11270..73091fbe3ea4 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -166,8 +166,8 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, /* 1. get the exact LZMA compressed size */ kin = kmap(*rq->in); err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, - min_t(unsigned int, rq->inputsize, - EROFS_BLKSIZ - rq->pageofs_in)); + min_t(unsigned int, rq->inputsize, + rq->sb->s_blocksize - rq->pageofs_in)); if (err) { kunmap(*rq->in); return err; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 6970b09b8307..b80abec0531a 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -50,44 +50,43 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct super_block *sb = dir->i_sb; + unsigned long bsz = sb->s_blocksize; const size_t dirsize = i_size_read(dir); - unsigned int i = ctx->pos / EROFS_BLKSIZ; - unsigned int ofs = ctx->pos % EROFS_BLKSIZ; + unsigned int i = erofs_blknr(sb, ctx->pos); + unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; + buf.inode = dir; while (ctx->pos < dirsize) { struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dir, i, EROFS_KMAP); + de = erofs_bread(&buf, i, EROFS_KMAP); if (IS_ERR(de)) { - erofs_err(dir->i_sb, - "fail to readdir of logical block %u of nid %llu", + erofs_err(sb, "fail to readdir of logical block %u of nid %llu", i, EROFS_I(dir)->nid); err = PTR_ERR(de); break; } nameoff = le16_to_cpu(de->nameoff); - if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= EROFS_BLKSIZ) { - erofs_err(dir->i_sb, - "invalid de[0].nameoff %u @ nid %llu", + if (nameoff < sizeof(struct erofs_dirent) || nameoff >= bsz) { + erofs_err(sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; break; } - maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, EROFS_BLKSIZ); + maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = blknr_to_addr(i) + ofs; + ctx->pos = erofs_pos(sb, i) + ofs; if (ofs >= nameoff) goto skip_this; } @@ -97,7 +96,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) if (err) break; skip_this: - ctx->pos = blknr_to_addr(i) + maxsize; + ctx->pos = erofs_pos(sb, i) + maxsize; ++i; ofs = 0; } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index dbcd24371002..2c7b16e340fe 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -27,6 +27,7 @@ #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 +#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ @@ -36,7 +37,8 @@ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE) + EROFS_FEATURE_INCOMPAT_DEDUPE | \ + EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -53,7 +55,7 @@ struct erofs_super_block { __le32 magic; /* file system magic number */ __le32 checksum; /* crc32c(super_block) */ __le32 feature_compat; - __u8 blkszbits; /* support block_size == PAGE_SIZE only */ + __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ __le16 root_nid; /* nid of root directory */ @@ -75,49 +77,46 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved[6]; + __u8 dirblkbits; /* directory block size in bit shift */ + __u8 xattr_prefix_count; /* # of long xattr name prefixes */ + __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 reserved2[24]; }; /* - * erofs inode datalayout (i_format in on-disk inode): + * EROFS inode datalayout (i_format in on-disk inode): * 0 - uncompressed flat inode without tail-packing inline data: - * inode, [xattrs], ... | ... | no-holed data * 1 - compressed inode with non-compact indexes: - * inode, [xattrs], [map_header], extents ... | ... * 2 - uncompressed flat inode with tail-packing inline data: - * inode, [xattrs], tailpacking data, ... | ... | no-holed data * 3 - compressed inode with compact indexes: - * inode, [xattrs], map_header, extents ... | ... * 4 - chunk-based inode with (optional) multi-device support: - * inode, [xattrs], chunk indexes ... | ... * 5~7 - reserved */ enum { EROFS_INODE_FLAT_PLAIN = 0, - EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, + EROFS_INODE_COMPRESSED_FULL = 1, EROFS_INODE_FLAT_INLINE = 2, - EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_COMPRESSED_COMPACT = 3, EROFS_INODE_CHUNK_BASED = 4, EROFS_INODE_DATALAYOUT_MAX }; static inline bool erofs_inode_is_data_compressed(unsigned int datamode) { - return datamode == EROFS_INODE_FLAT_COMPRESSION || - datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; + return datamode == EROFS_INODE_COMPRESSED_COMPACT || + datamode == EROFS_INODE_COMPRESSED_FULL; } /* bit definitions of inode i_format */ -#define EROFS_I_VERSION_BITS 1 -#define EROFS_I_DATALAYOUT_BITS 3 +#define EROFS_I_VERSION_MASK 0x01 +#define EROFS_I_DATALAYOUT_MASK 0x07 #define EROFS_I_VERSION_BIT 0 #define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_ALL_BIT 4 -#define EROFS_I_ALL \ - ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) +#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) /* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F @@ -127,11 +126,30 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_CHUNK_FORMAT_ALL \ (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +/* 32-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_COMPACT 0 +/* 64-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_EXTENDED 1 + struct erofs_inode_chunk_info { __le16 format; /* chunk blkbits, etc. */ __le16 reserved; }; +union erofs_inode_i_u { + /* total compressed blocks for compressed inodes */ + __le32 compressed_blocks; + + /* block address for uncompressed flat inodes */ + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -142,29 +160,14 @@ struct erofs_inode_compact { __le16 i_nlink; __le32 i_size; __le32 i_reserved; - union { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ - struct erofs_inode_chunk_info c; - } i_u; - __le32 i_ino; /* only used for 32-bit stat compatibility */ + union erofs_inode_i_u i_u; + + __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; __le32 i_reserved2; }; -/* 32-byte on-disk inode */ -#define EROFS_INODE_LAYOUT_COMPACT 0 -/* 64-byte on-disk inode */ -#define EROFS_INODE_LAYOUT_EXTENDED 1 - /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ @@ -174,22 +177,9 @@ struct erofs_inode_extended { __le16 i_mode; __le16 i_reserved; __le64 i_size; - union { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ - struct erofs_inode_chunk_info c; - } i_u; - - /* only used for 32-bit stat compatibility */ - __le32 i_ino; + union erofs_inode_i_u i_u; + __le32 i_ino; /* only used for 32-bit stat compatibility */ __le32 i_uid; __le32 i_gid; __le64 i_mtime; @@ -198,10 +188,6 @@ struct erofs_inode_extended { __u8 i_reserved2[16]; }; -#define EROFS_MAX_SHARED_XATTRS (128) -/* h_shared_count between 129 ... 255 are special # */ -#define EROFS_SHARED_XATTR_EXTENT (255) - /* * inline xattrs (n == i_xattr_icount): * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes @@ -228,6 +214,13 @@ struct erofs_xattr_ibody_header { #define EROFS_XATTR_INDEX_LUSTRE 5 #define EROFS_XATTR_INDEX_SECURITY 6 +/* + * bit 7 of e_name_index is set when it refers to a long xattr name prefix, + * while the remained lower bits represent the index of the prefix. + */ +#define EROFS_XATTR_LONG_PREFIX 0x80 +#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f + /* xattr entry (for both inline & shared xattrs) */ struct erofs_xattr_entry { __u8 e_name_len; /* length of name */ @@ -237,6 +230,12 @@ struct erofs_xattr_entry { char e_name[]; /* attribute name */ }; +/* long xattr name prefix */ +struct erofs_xattr_long_prefix { + __u8 base_index; /* short xattr name prefix index */ + char infix[]; /* infix apart from short prefix */ +}; + static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) { if (!i_xattr_icount) @@ -267,6 +266,22 @@ struct erofs_inode_chunk_index { __le32 blkaddr; /* start block address of this inode chunk */ }; +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* node number */ + __le16 nameoff; /* start offset of file name */ + __u8 file_type; /* file type */ + __u8 reserved; /* reserved */ +} __packed; + +/* + * EROFS file types should match generic FT_* types and + * it seems no need to add BUILD_BUG_ONs since potential + * unmatchness will break other fses as well... + */ + +#define EROFS_NAME_LEN 255 + /* maximum supported size of a physical compression cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) @@ -336,10 +351,8 @@ struct z_erofs_map_header { __u8 h_clusterbits; }; -#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 - /* - * Fixed-sized output compression on-disk logical cluster type: + * On-disk logical cluster type: * 0 - literal (uncompressed) lcluster * 1,3 - compressed lcluster (for HEAD lclusters) * 2 - compressed lcluster (for NONHEAD lclusters) @@ -363,27 +376,27 @@ struct z_erofs_map_header { * di_u.delta[1] = distance to the next HEAD lcluster */ enum { - Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, - Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, - Z_EROFS_VLE_CLUSTER_TYPE_MAX + Z_EROFS_LCLUSTER_TYPE_PLAIN = 0, + Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1, + Z_EROFS_LCLUSTER_TYPE_NONHEAD = 2, + Z_EROFS_LCLUSTER_TYPE_HEAD2 = 3, + Z_EROFS_LCLUSTER_TYPE_MAX }; -#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 -#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2 +#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0 /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ -#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15) +#define Z_EROFS_LI_PARTIAL_REF (1 << 15) /* * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the * compressed block count of a compressed extent (in logical clusters, aka. * block count of a pcluster). */ -#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11) +#define Z_EROFS_LI_D0_CBLKCNT (1 << 11) -struct z_erofs_vle_decompressed_index { +struct z_erofs_lcluster_index { __le16 di_advise; /* where to decompress in the head lcluster */ __le16 di_clusterofs; @@ -400,25 +413,8 @@ struct z_erofs_vle_decompressed_index { } di_u; }; -#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ - (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ - sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) - -/* dirent sorts in alphabet order, thus we can do binary search */ -struct erofs_dirent { - __le64 nid; /* node number */ - __le16 nameoff; /* start offset of file name */ - __u8 file_type; /* file type */ - __u8 reserved; /* reserved */ -} __packed; - -/* - * EROFS file types should match generic FT_* types and - * it seems no need to add BUILD_BUG_ONs since potential - * unmatchness will break other fses as well... - */ - -#define EROFS_NAME_LEN 255 +#define Z_EROFS_FULL_INDEX_ALIGN(end) \ + (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8) /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) @@ -435,15 +431,15 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); - BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_lcluster_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != - sizeof(struct z_erofs_vle_decompressed_index)); + sizeof(struct z_erofs_lcluster_index)); BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); - BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < - Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); + BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) < + Z_EROFS_LCLUSTER_TYPE_MAX - 1); /* exclude old compiler versions like gcc 7.5.0 */ BUILD_BUG_ON(__builtin_constant_p(fmh) ? fmh != cpu_to_le64(1ULL << 63) : 0); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 96a87c023128..87ff35bff8d5 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -209,8 +209,8 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) void *src; /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(map.m_pa); - blknr = erofs_blknr(map.m_pa); + offset = erofs_blkoff(sb, map.m_pa); + blknr = erofs_blknr(sb, map.m_pa); size = map.m_llen; src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); @@ -460,6 +460,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; ctx->cookie = cookie; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 4be7dda3cd24..d70b12b81507 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -23,11 +23,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, unsigned int ifmt; int err; - blkaddr = erofs_blknr(inode_loc); - *ofs = erofs_blkoff(inode_loc); - - erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", - __func__, vi->nid, *ofs, blkaddr); + blkaddr = erofs_blknr(sb, inode_loc); + *ofs = erofs_blkoff(sb, inode_loc); kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); if (IS_ERR(kaddr)) { @@ -58,11 +55,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the extended inode acrosses block boundary */ - if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) { + if (*ofs + vi->inode_isize <= sb->s_blocksize) { *ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = EROFS_BLKSIZ - *ofs; + const unsigned int gotten = sb->s_blocksize - *ofs; copied = kmalloc(vi->inode_isize, GFP_NOFS); if (!copied) { @@ -176,7 +173,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, err = -EOPNOTSUPP; goto err_out; } - vi->chunkbits = LOG_BLOCK_SIZE + + vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; @@ -188,11 +185,12 @@ static void *erofs_read_inode(struct erofs_buf *buf, if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && vi->datalayout == EROFS_INODE_FLAT_PLAIN) inode->i_flags |= S_DAX; + if (!nblks) /* measure inode.i_blocks as generic filesystems */ - inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; + inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; else - inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; + inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); return kaddr; bogusimode: @@ -210,11 +208,12 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, unsigned int m_pofs) { struct erofs_inode *vi = EROFS_I(inode); + unsigned int bsz = i_blocksize(inode); char *lnk; /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { + inode->i_size >= bsz || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -225,7 +224,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, m_pofs += vi->xattr_isize; /* inline symlink data shouldn't cross block boundary */ - if (m_pofs + inode->i_size > EROFS_BLKSIZ) { + if (m_pofs + inode->i_size > bsz) { kfree(lnk); erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", @@ -289,10 +288,15 @@ static int erofs_fill_inode(struct inode *inode) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - if (!erofs_is_fscache_mode(inode->i_sb)) - err = z_erofs_fill_inode(inode); - else - err = -EOPNOTSUPP; +#ifdef CONFIG_EROFS_FS_ZIP + if (!erofs_is_fscache_mode(inode->i_sb) && + inode->i_sb->s_blocksize_bits == PAGE_SHIFT) { + inode->i_mapping->a_ops = &z_erofs_aops; + err = 0; + goto out_unlock; + } +#endif + err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1db018f8c2e8..af0431a40647 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -31,10 +31,8 @@ __printf(3, 4) void _erofs_info(struct super_block *sb, #define erofs_info(sb, fmt, ...) \ _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) #ifdef CONFIG_EROFS_FS_DEBUG -#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__) #define DBG_BUGON BUG_ON #else -#define erofs_dbg(x, ...) ((void)0) #define DBG_BUGON(x) ((void)(x)) #endif /* !CONFIG_EROFS_FS_DEBUG */ @@ -81,6 +79,7 @@ struct erofs_dev_context { struct rw_semaphore rwsem; unsigned int extra_devices; + bool flatdev; }; struct erofs_fs_context { @@ -116,6 +115,11 @@ struct erofs_fscache { char *name; }; +struct erofs_xattr_prefix_item { + struct erofs_xattr_long_prefix *prefix; + u8 infix_len; +}; + struct erofs_sb_info { struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP @@ -133,8 +137,8 @@ struct erofs_sb_info { struct inode *managed_cache; struct erofs_sb_lz4_info lz4; - struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ + struct inode *packed_inode; struct erofs_dev_context *devs; struct dax_device *dax_dev; u64 dax_part_off; @@ -144,11 +148,14 @@ struct erofs_sb_info { u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; + u32 xattr_prefix_start; + u8 xattr_prefix_count; + struct erofs_xattr_prefix_item *xattr_prefixes; #endif u16 device_id_mask; /* valid bits of device id to be used */ - /* inode slot unit size in bit shift */ - unsigned char islotbits; + unsigned char islotbits; /* inode slot unit size in bit shift */ + unsigned char blkszbits; /* filesystem block size in bit shift */ u32 sb_size; /* total superblock size */ u32 build_time_nsec; @@ -156,6 +163,7 @@ struct erofs_sb_info { /* what we really care is nid, rather than ino.. */ erofs_nid_t root_nid; + erofs_nid_t packed_nid; /* used for statfs, f_files - f_favail */ u64 inos; @@ -240,27 +248,13 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) VAL != EROFS_LOCKED_MAGIC); } -/* we strictly follow PAGE_SIZE and no buffer head yet */ -#define LOG_BLOCK_SIZE PAGE_SHIFT - -#undef LOG_SECTORS_PER_BLOCK -#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) - -#undef SECTORS_PER_BLOCK -#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK) - -#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) - -#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) -#error erofs cannot be used in this platform -#endif - enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { + struct inode *inode; struct page *page; void *base; enum erofs_kmap_type kmap_type; @@ -269,9 +263,10 @@ struct erofs_buf { #define ROOT_NID(sb) ((sb)->root_nid) -#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) -#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) -#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) +#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) +#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) +#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) #define EROFS_FEATURE_FUNCS(name, compat, feature) \ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ @@ -288,6 +283,7 @@ EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) +EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -306,7 +302,7 @@ struct erofs_inode { unsigned char datalayout; unsigned char inode_isize; - unsigned short xattr_isize; + unsigned int xattr_isize; unsigned int xattr_shared_count; unsigned int *xattr_shared_xattrs; @@ -343,28 +339,18 @@ static inline erofs_off_t erofs_iloc(struct inode *inode) { struct erofs_sb_info *sbi = EROFS_I_SB(inode); - return blknr_to_addr(sbi->meta_blkaddr) + + return erofs_pos(inode->i_sb, sbi->meta_blkaddr) + (EROFS_I(inode)->nid << sbi->islotbits); } -static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, - unsigned int bits) -{ - - return (value >> bit) & ((1 << bits) - 1); -} - - -static inline unsigned int erofs_inode_version(unsigned int value) +static inline unsigned int erofs_inode_version(unsigned int ifmt) { - return erofs_bitrange(value, EROFS_I_VERSION_BIT, - EROFS_I_VERSION_BITS); + return (ifmt >> EROFS_I_VERSION_BIT) & EROFS_I_VERSION_MASK; } -static inline unsigned int erofs_inode_datalayout(unsigned int value) +static inline unsigned int erofs_inode_datalayout(unsigned int ifmt) { - return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT, - EROFS_I_DATALAYOUT_BITS); + return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK; } /* @@ -451,10 +437,13 @@ extern const struct iomap_ops z_erofs_iomap_report_ops; #define EROFS_REG_COOKIE_SHARE 0x0001 #define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type); +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); @@ -521,7 +510,6 @@ int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); -int z_erofs_fill_inode(struct inode *inode); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags); #else @@ -541,7 +529,6 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } -static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 966eabc61c13..d4f631d39f0f 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -89,7 +89,8 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, static void *erofs_find_target_block(struct erofs_buf *target, struct inode *dir, struct erofs_qstr *name, int *_ndirents) { - int head = 0, back = DIV_ROUND_UP(dir->i_size, EROFS_BLKSIZ) - 1; + unsigned int bsz = i_blocksize(dir); + int head = 0, back = erofs_iblks(dir) - 1; unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); @@ -98,10 +99,10 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - de = erofs_bread(&buf, dir, mid, EROFS_KMAP); + buf.inode = dir; + de = erofs_bread(&buf, mid, EROFS_KMAP); if (!IS_ERR(de)) { - const int nameoff = nameoff_from_disk(de->nameoff, - EROFS_BLKSIZ); + const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); int diff; unsigned int matched; @@ -121,11 +122,10 @@ static void *erofs_find_target_block(struct erofs_buf *target, dname.name = (u8 *)de + nameoff; if (ndirents == 1) - dname.end = (u8 *)de + EROFS_BLKSIZ; + dname.end = (u8 *)de + bsz; else dname.end = (u8 *)de + - nameoff_from_disk(de[1].nameoff, - EROFS_BLKSIZ); + nameoff_from_disk(de[1].nameoff, bsz); /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); @@ -171,6 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.name = name->name; qn.end = name->name + name->len; + buf.inode = dir; ndirents = 0; de = erofs_find_target_block(&buf, dir, &qn, &ndirents); @@ -178,7 +179,8 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, return PTR_ERR(de); if (ndirents) - de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); + de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir), + ndirents); if (!IS_ERR(de)) { *nid = le64_to_cpu(de->nid); @@ -203,16 +205,13 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); - if (err == -ENOENT) { + if (err == -ENOENT) /* negative dentry */ inode = NULL; - } else if (err) { + else if (err) inode = ERR_PTR(err); - } else { - erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, - dentry, nid, d_type); + else inode = erofs_iget(dir->i_sb, nid); - } return d_splice_alias(inode, dentry); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 19b1ae79cec4..811ab66d805e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -52,18 +52,21 @@ void _erofs_info(struct super_block *sb, const char *function, static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { + size_t len = 1 << EROFS_SB(sb)->blkszbits; struct erofs_super_block *dsb; u32 expected_crc, crc; - dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, - EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL); + if (len > EROFS_SUPER_OFFSET) + len -= EROFS_SUPER_OFFSET; + + dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL); if (!dsb) return -ENOMEM; expected_crc = le32_to_cpu(dsb->checksum); dsb->checksum = 0; /* to allow for x86 boot sectors and other oddities. */ - crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET); + crc = crc32c(~0, dsb, len); kfree(dsb); if (crc != expected_crc) { @@ -123,20 +126,19 @@ static bool check_layout_compatibility(struct super_block *sb, return true; } -#ifdef CONFIG_EROFS_FS_ZIP /* read variable-sized metadata, offset will be aligned by 4-byte */ -static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, - erofs_off_t *offset, int *lengthp) +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp) { u8 *buffer, *ptr; int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP); + ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); if (IS_ERR(ptr)) return ptr; - len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); + len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); @@ -146,19 +148,20 @@ static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, *lengthp = len; for (i = 0; i < len; i += cnt) { - cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), - EROFS_KMAP); + cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), + len - i); + ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } - memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); + memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt); *offset += cnt; } return buffer; } +#ifdef CONFIG_EROFS_FS_ZIP static int erofs_load_compr_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { @@ -175,6 +178,7 @@ static int erofs_load_compr_cfgs(struct super_block *sb, return -EINVAL; } + erofs_init_metabuf(&buf, sb); offset = EROFS_SUPER_OFFSET + sbi->sb_size; alg = 0; for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { @@ -228,10 +232,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct block_device *bdev; void *ptr; - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); if (IS_ERR(ptr)) return PTR_ERR(ptr); - dis = ptr + erofs_blkoff(*pos); + dis = ptr + erofs_blkoff(sb, *pos); if (!dif->path) { if (!dis->tag[0]) { @@ -248,7 +252,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; - } else { + } else if (!sbi->devs->flatdev) { bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, sb->s_type); if (IS_ERR(bdev)) @@ -290,6 +294,9 @@ static int erofs_scan_devices(struct super_block *sb, if (!ondisk_extradevs) return 0; + if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) + sbi->devs->flatdev = true; + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); @@ -329,7 +336,6 @@ static int erofs_read_superblock(struct super_block *sb) struct erofs_sb_info *sbi; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; - unsigned int blkszbits; void *data; int ret; @@ -348,6 +354,16 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } + sbi->blkszbits = dsb->blkszbits; + if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { + erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); + goto out; + } + if (dsb->dirblkbits) { + erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits); + goto out; + } + sbi->feature_compat = le32_to_cpu(dsb->feature_compat); if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); @@ -356,19 +372,11 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - blkszbits = dsb->blkszbits; - /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ - if (blkszbits != LOG_BLOCK_SIZE) { - erofs_err(sb, "blkszbits %u isn't supported on this platform", - blkszbits); - goto out; - } - if (!check_layout_compatibility(sb, dsb)) goto out; sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; - if (sbi->sb_size > EROFS_BLKSIZ) { + if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", sbi->sb_size); goto out; @@ -377,20 +385,12 @@ static int erofs_read_superblock(struct super_block *sb) sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); + sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); + sbi->xattr_prefix_count = dsb->xattr_prefix_count; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); -#ifdef CONFIG_EROFS_FS_ZIP - sbi->packed_inode = NULL; - if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { - sbi->packed_inode = - erofs_iget(sb, le64_to_cpu(dsb->packed_nid)); - if (IS_ERR(sbi->packed_inode)) { - ret = PTR_ERR(sbi->packed_inode); - goto out; - } - } -#endif + sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); @@ -417,8 +417,6 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); - if (erofs_sb_has_ztailpacking(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); if (erofs_sb_has_fragments(sbi)) @@ -733,9 +731,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sbi->domain_id = ctx->domain_id; ctx->domain_id = NULL; + sbi->blkszbits = PAGE_SHIFT; if (erofs_is_fscache_mode(sb)) { - sb->s_blocksize = EROFS_BLKSIZ; - sb->s_blocksize_bits = LOG_BLOCK_SIZE; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; err = erofs_fscache_register_fs(sb); if (err) @@ -745,8 +744,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; } else { - if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { - erofs_err(sb, "failed to set erofs blksize"); + if (!sb_set_blocksize(sb, PAGE_SIZE)) { + errorfc(fc, "failed to set initial blksize"); return -EINVAL; } @@ -759,12 +758,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - if (test_opt(&sbi->opt, DAX_ALWAYS)) { - BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE); + if (sb->s_blocksize_bits != sbi->blkszbits) { + if (erofs_is_fscache_mode(sb)) { + errorfc(fc, "unsupported blksize for fscache mode"); + return -EINVAL; + } + if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { + errorfc(fc, "failed to set erofs blksize"); + return -EINVAL; + } + } + if (test_opt(&sbi->opt, DAX_ALWAYS)) { if (!sbi->dax_dev) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); + } else if (sbi->blkszbits != PAGE_SHIFT) { + errorfc(fc, "unsupported blocksize for DAX"); + clear_opt(&sbi->opt, DAX_ALWAYS); } } @@ -799,10 +810,22 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) erofs_shrinker_register(sb); /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(sbi->packed_inode)) { + err = PTR_ERR(sbi->packed_inode); + sbi->packed_inode = NULL; + return err; + } + } err = erofs_init_managed_cache(sb); if (err) return err; + err = erofs_xattr_prefixes_init(sb); + if (err) + return err; + err = erofs_register_sysfs(sb); if (err) return err; @@ -962,12 +985,13 @@ static void erofs_put_super(struct super_block *sb) erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); + erofs_xattr_prefixes_cleanup(sb); #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; +#endif iput(sbi->packed_inode); sbi->packed_inode = NULL; -#endif erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); @@ -1060,7 +1084,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; - buf->f_bsize = EROFS_BLKSIZ; + buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 60729b1220b6..cd80499351e0 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -7,6 +7,19 @@ #include <linux/security.h> #include "xattr.h" +static inline erofs_blk_t erofs_xattr_blkaddr(struct super_block *sb, + unsigned int xattr_id) +{ + return EROFS_SB(sb)->xattr_blkaddr + + erofs_blknr(sb, xattr_id * sizeof(__u32)); +} + +static inline unsigned int erofs_xattr_blkoff(struct super_block *sb, + unsigned int xattr_id) +{ + return erofs_blkoff(sb, xattr_id * sizeof(__u32)); +} + struct xattr_iter { struct super_block *sb; struct erofs_buf buf; @@ -16,7 +29,7 @@ struct xattr_iter { unsigned int ofs; }; -static int init_inode_xattrs(struct inode *inode) +static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct xattr_iter it; @@ -68,8 +81,8 @@ static int init_inode_xattrs(struct inode *inode) } it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(erofs_iloc(inode) + vi->inode_isize); - it.ofs = erofs_blkoff(erofs_iloc(inode) + vi->inode_isize); + it.blkaddr = erofs_blknr(sb, erofs_iloc(inode) + vi->inode_isize); + it.ofs = erofs_blkoff(sb, erofs_iloc(inode) + vi->inode_isize); /* read in shared xattr array (non-atomic, see kmalloc below) */ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); @@ -92,9 +105,9 @@ static int init_inode_xattrs(struct inode *inode) it.ofs += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - if (it.ofs >= EROFS_BLKSIZ) { + if (it.ofs >= sb->s_blocksize) { /* cannot be unaligned */ - DBG_BUGON(it.ofs != EROFS_BLKSIZ); + DBG_BUGON(it.ofs != sb->s_blocksize); it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, EROFS_KMAP); @@ -139,15 +152,15 @@ struct xattr_iter_handlers { static inline int xattr_iter_fixup(struct xattr_iter *it) { - if (it->ofs < EROFS_BLKSIZ) + if (it->ofs < it->sb->s_blocksize) return 0; - it->blkaddr += erofs_blknr(it->ofs); + it->blkaddr += erofs_blknr(it->sb, it->ofs); it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - it->ofs = erofs_blkoff(it->ofs); + it->ofs = erofs_blkoff(it->sb, it->ofs); return 0; } @@ -157,7 +170,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, struct erofs_inode *const vi = EROFS_I(inode); unsigned int xattr_header_sz, inline_xattr_ofs; - xattr_header_sz = inlinexattr_header_size(inode); + xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; if (xattr_header_sz >= vi->xattr_isize) { DBG_BUGON(xattr_header_sz > vi->xattr_isize); return -ENOATTR; @@ -165,8 +179,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - it->blkaddr = erofs_blknr(erofs_iloc(inode) + inline_xattr_ofs); - it->ofs = erofs_blkoff(erofs_iloc(inode) + inline_xattr_ofs); + it->blkaddr = erofs_blknr(it->sb, erofs_iloc(inode) + inline_xattr_ofs); + it->ofs = erofs_blkoff(it->sb, erofs_iloc(inode) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) @@ -222,8 +236,8 @@ static int xattr_foreach(struct xattr_iter *it, processed = 0; while (processed < entry.e_name_len) { - if (it->ofs >= EROFS_BLKSIZ) { - DBG_BUGON(it->ofs > EROFS_BLKSIZ); + if (it->ofs >= it->sb->s_blocksize) { + DBG_BUGON(it->ofs > it->sb->s_blocksize); err = xattr_iter_fixup(it); if (err) @@ -231,7 +245,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, + slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, entry.e_name_len - processed); /* handle name */ @@ -257,8 +271,8 @@ static int xattr_foreach(struct xattr_iter *it, } while (processed < value_sz) { - if (it->ofs >= EROFS_BLKSIZ) { - DBG_BUGON(it->ofs > EROFS_BLKSIZ); + if (it->ofs >= it->sb->s_blocksize) { + DBG_BUGON(it->ofs > it->sb->s_blocksize); err = xattr_iter_fixup(it); if (err) @@ -266,7 +280,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, + slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, value_sz - processed); op->value(it, processed, it->kaddr + it->ofs, slice); it->ofs += slice; @@ -283,17 +297,45 @@ struct getxattr_iter { struct xattr_iter it; char *buffer; - int buffer_size, index; + int buffer_size, index, infix_len; struct qstr name; }; +static int erofs_xattr_long_entrymatch(struct getxattr_iter *it, + struct erofs_xattr_entry *entry) +{ + struct erofs_sb_info *sbi = EROFS_SB(it->it.sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return -ENOATTR; + + if (it->index != pf->prefix->base_index || + it->name.len != entry->e_name_len + pf->infix_len) + return -ENOATTR; + + if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) + return -ENOATTR; + + it->infix_len = pf->infix_len; + return 0; +} + static int xattr_entrymatch(struct xattr_iter *_it, struct erofs_xattr_entry *entry) { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - return (it->index != entry->e_name_index || - it->name.len != entry->e_name_len) ? -ENOATTR : 0; + /* should also match the infix for long name prefixes */ + if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) + return erofs_xattr_long_entrymatch(it, entry); + + if (it->index != entry->e_name_index || + it->name.len != entry->e_name_len) + return -ENOATTR; + it->infix_len = 0; + return 0; } static int xattr_namematch(struct xattr_iter *_it, @@ -301,7 +343,9 @@ static int xattr_namematch(struct xattr_iter *_it, { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0; + if (memcmp(buf, it->name.name + it->infix_len + processed, len)) + return -ENOATTR; + return 0; } static int xattr_checkbuffer(struct xattr_iter *_it, @@ -351,21 +395,18 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) { struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int i; + struct super_block *const sb = it->it.sb; + unsigned int i, xsid; int ret = -ENOATTR; for (i = 0; i < vi->xattr_shared_count; ++i) { - erofs_blk_t blkaddr = - xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); - - it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP); + xsid = vi->xattr_shared_xattrs[i]; + it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); + it->it.ofs = erofs_xattr_blkoff(sb, xsid); + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, + it->it.blkaddr, EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); - it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); if (ret != -ENOATTR) @@ -394,7 +435,7 @@ int erofs_getxattr(struct inode *inode, int index, if (!name) return -EINVAL; - ret = init_inode_xattrs(inode); + ret = erofs_init_inode_xattrs(inode); if (ret) return ret; @@ -421,20 +462,9 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - switch (handler->flags) { - case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->opt, XATTR_USER)) - return -EOPNOTSUPP; - break; - case EROFS_XATTR_INDEX_TRUSTED: - break; - case EROFS_XATTR_INDEX_SECURITY: - break; - default: - return -EINVAL; - } + if (handler->flags == EROFS_XATTR_INDEX_USER && + !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER)) + return -EOPNOTSUPP; return erofs_getxattr(inode, handler->flags, name, buffer, size); } @@ -463,10 +493,6 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { const struct xattr_handler *erofs_xattr_handlers[] = { &erofs_xattr_user_handler, -#ifdef CONFIG_EROFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY &erofs_xattr_security_handler, @@ -487,29 +513,40 @@ static int xattr_entrylist(struct xattr_iter *_it, { struct listxattr_iter *it = container_of(_it, struct listxattr_iter, it); - unsigned int prefix_len; - const char *prefix; - - const struct xattr_handler *h = - erofs_xattr_handler(entry->e_name_index); + unsigned int base_index = entry->e_name_index; + unsigned int prefix_len, infix_len = 0; + const char *prefix, *infix = NULL; + + if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(_it->sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return 1; + infix = pf->prefix->infix; + infix_len = pf->infix_len; + base_index = pf->prefix->base_index; + } - if (!h || (h->list && !h->list(it->dentry))) + prefix = erofs_xattr_prefix(base_index, it->dentry); + if (!prefix) return 1; - - prefix = xattr_prefix(h); prefix_len = strlen(prefix); if (!it->buffer) { - it->buffer_ofs += prefix_len + entry->e_name_len + 1; + it->buffer_ofs += prefix_len + infix_len + + entry->e_name_len + 1; return 1; } - if (it->buffer_ofs + prefix_len + if (it->buffer_ofs + prefix_len + infix_len + + entry->e_name_len + 1 > it->buffer_size) return -ERANGE; memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); - it->buffer_ofs += prefix_len; + memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); + it->buffer_ofs += prefix_len + infix_len; return 0; } @@ -563,21 +600,18 @@ static int shared_listxattr(struct listxattr_iter *it) { struct inode *const inode = d_inode(it->dentry); struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int i; + struct super_block *const sb = it->it.sb; + unsigned int i, xsid; int ret = 0; for (i = 0; i < vi->xattr_shared_count; ++i) { - erofs_blk_t blkaddr = - xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); - - it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP); + xsid = vi->xattr_shared_xattrs[i]; + it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); + it->it.ofs = erofs_xattr_blkoff(sb, xsid); + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, + it->it.blkaddr, EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); - it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); if (ret) @@ -592,7 +626,7 @@ ssize_t erofs_listxattr(struct dentry *dentry, int ret; struct listxattr_iter it; - ret = init_inode_xattrs(d_inode(dentry)); + ret = erofs_init_inode_xattrs(d_inode(dentry)); if (ret == -ENOATTR) return 0; if (ret) @@ -613,6 +647,62 @@ ssize_t erofs_listxattr(struct dentry *dentry, return ret; } +void erofs_xattr_prefixes_cleanup(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + int i; + + if (sbi->xattr_prefixes) { + for (i = 0; i < sbi->xattr_prefix_count; i++) + kfree(sbi->xattr_prefixes[i].prefix); + kfree(sbi->xattr_prefixes); + sbi->xattr_prefixes = NULL; + } +} + +int erofs_xattr_prefixes_init(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; + struct erofs_xattr_prefix_item *pfs; + int ret = 0, i, len; + + if (!sbi->xattr_prefix_count) + return 0; + + pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL); + if (!pfs) + return -ENOMEM; + + if (erofs_sb_has_fragments(sbi)) + buf.inode = sbi->packed_inode; + else + erofs_init_metabuf(&buf, sb); + + for (i = 0; i < sbi->xattr_prefix_count; i++) { + void *ptr = erofs_read_metadata(sb, &buf, &pos, &len); + + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + break; + } else if (len < sizeof(*pfs->prefix) || + len > EROFS_NAME_LEN + sizeof(*pfs->prefix)) { + kfree(ptr); + ret = -EFSCORRUPTED; + break; + } + pfs[i].prefix = ptr; + pfs[i].infix_len = len - sizeof(struct erofs_xattr_long_prefix); + } + + erofs_put_metabuf(&buf); + sbi->xattr_prefixes = pfs; + if (ret) + erofs_xattr_prefixes_cleanup(sb); + return ret; +} + #ifdef CONFIG_EROFS_FS_POSIX_ACL struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) { diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 0a43c9ee9f8f..f16283cb8c93 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -13,43 +13,21 @@ /* Attribute not found */ #define ENOATTR ENODATA -static inline unsigned int inlinexattr_header_size(struct inode *inode) -{ - return sizeof(struct erofs_xattr_ibody_header) + - sizeof(u32) * EROFS_I(inode)->xattr_shared_count; -} - -static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, - unsigned int xattr_id) -{ -#ifdef CONFIG_EROFS_FS_XATTR - return sbi->xattr_blkaddr + - xattr_id * sizeof(__u32) / EROFS_BLKSIZ; -#else - return 0; -#endif -} - -static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, - unsigned int xattr_id) -{ - return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; -} - #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; extern const struct xattr_handler erofs_xattr_security_handler; -static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) +static inline const char *erofs_xattr_prefix(unsigned int idx, + struct dentry *dentry) { + const struct xattr_handler *handler = NULL; + static const struct xattr_handler *xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL - [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = - &posix_acl_access_xattr_handler, - [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY @@ -57,15 +35,24 @@ static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) #endif }; - return idx && idx < ARRAY_SIZE(xattr_handler_map) ? - xattr_handler_map[idx] : NULL; + if (idx && idx < ARRAY_SIZE(xattr_handler_map)) + handler = xattr_handler_map[idx]; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } extern const struct xattr_handler *erofs_xattr_handlers[]; +int erofs_xattr_prefixes_init(struct super_block *sb); +void erofs_xattr_prefixes_cleanup(struct super_block *sb); int erofs_getxattr(struct inode *, int, const char *, void *, size_t); ssize_t erofs_listxattr(struct dentry *, char *, size_t); #else +static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; } +static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {} static inline int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f1708c77a991..45f21db2303a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -807,7 +807,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(map->m_pa); + pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); pcl->tailpacking_size = map->m_plen; } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; @@ -930,6 +930,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, struct page *page, unsigned int pageofs, unsigned int len) { + struct super_block *sb = inode->i_sb; struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; u8 *src, *dst; @@ -938,19 +939,19 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, if (!packed_inode) return -EFSCORRUPTED; + buf.inode = packed_inode; pos += EROFS_I(inode)->z_fragmentoff; for (i = 0; i < len; i += cnt) { cnt = min_t(unsigned int, len - i, - EROFS_BLKSIZ - erofs_blkoff(pos)); - src = erofs_bread(&buf, packed_inode, - erofs_blknr(pos), EROFS_KMAP); + sb->s_blocksize - erofs_blkoff(sb, pos)); + src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); + memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); kunmap_local(dst); pos += cnt; } @@ -978,8 +979,6 @@ repeat: if (offset + cur < map->m_la || offset + cur >= map->m_la + map->m_llen) { - erofs_dbg("out-of-range map @ pos %llu", offset + cur); - if (z_erofs_collector_end(fe)) fe->backmost = false; map->m_la = offset + cur; @@ -1005,7 +1004,8 @@ repeat: void *mp; mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(map->m_pa), EROFS_NO_KMAP); + erofs_blknr(inode->i_sb, map->m_pa), + EROFS_NO_KMAP); if (IS_ERR(mp)) { err = PTR_ERR(mp); erofs_err(inode->i_sb, @@ -1103,9 +1103,6 @@ out: if (err) z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); - - erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); return err; } @@ -1726,11 +1723,11 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = blknr_to_addr(pcl->obj.index), + .m_pa = erofs_pos(sb, pcl->obj.index), }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(mdev.m_pa); + cur = erofs_blknr(sb, mdev.m_pa); end = cur + pcl->pclusterpages; do { @@ -1764,7 +1761,7 @@ submit_bio_retry: last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << - LOG_SECTORS_PER_BLOCK; + (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 655da4d739cb..d37c5c89c728 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -7,24 +7,6 @@ #include <asm/unaligned.h> #include <trace/events/erofs.h> -int z_erofs_fill_inode(struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - if (!erofs_sb_has_big_pcluster(sbi) && - !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { - vi->z_advise = 0; - vi->z_algorithmtype[0] = 0; - vi->z_algorithmtype[1] = 0; - vi->z_logical_clusterbits = LOG_BLOCK_SIZE; - set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); - } - inode->i_mapping->a_ops = &z_erofs_aops; - return 0; -} - struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; @@ -45,47 +27,50 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t pos = - Z_EROFS_VLE_LEGACY_INDEX_ALIGN(erofs_iloc(inode) + - vi->inode_isize + vi->xattr_isize) + - lcn * sizeof(struct z_erofs_vle_decompressed_index); - struct z_erofs_vle_decompressed_index *di; + const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_lcluster_index); + struct z_erofs_lcluster_index *di; unsigned int advise, type; m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP); + erofs_blknr(inode->i_sb, pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); - m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); m->lcn = lcn; - di = m->kaddr + erofs_blkoff(pos); + di = m->kaddr + erofs_blkoff(inode->i_sb, pos); advise = le16_to_cpu(di->di_advise); - type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & - ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); + type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) & + ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1); switch (type) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); - if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } m->compressedblks = m->delta[0] & - ~Z_EROFS_VLE_DI_D0_CBLKCNT; + ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: - if (advise & Z_EROFS_VLE_DI_PARTIAL_REF) + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: + if (advise & Z_EROFS_LI_PARTIAL_REF) m->partialref = true; m->clusterofs = le16_to_cpu(di->di_clusterofs); + if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } m->pblk = le32_to_cpu(di->di_u.blkaddr); break; default: @@ -121,13 +106,13 @@ static int get_compacted_la_distance(unsigned int lclusterbits, lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); - if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) return d1; ++d1; } while (++i < vcnt); - /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */ - if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT)) + /* vcnt - 1 (Z_EROFS_LCLUSTER_TYPE_NONHEAD) item */ + if (!(lo & Z_EROFS_LI_D0_CBLKCNT)) d1 += lo - 1; return d1; } @@ -156,7 +141,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; - eofs = erofs_blkoff(pos); + eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); in = m->kaddr + base; @@ -165,19 +150,19 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); m->type = type; - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) m->delta[1] = get_compacted_la_distance(lclusterbits, encodebits, vcnt, in, i); - if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (lo & Z_EROFS_LI_D0_CBLKCNT) { if (!big_pcluster) { DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; return 0; } else if (i + 1 != (int)vcnt) { @@ -191,9 +176,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, */ lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * (i - 1), &type); - if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) lo = 0; - else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) + else if (lo & Z_EROFS_LI_D0_CBLKCNT) lo = 1; m->delta[0] = lo + 1; return 0; @@ -207,7 +192,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, --i; lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) i -= lo; if (i >= 0) @@ -219,10 +204,10 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, --i; lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { - if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (lo & Z_EROFS_LI_D0_CBLKCNT) { --i; - nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT; continue; } /* bigpcluster shouldn't have plain d0 == 1 */ @@ -249,7 +234,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, const unsigned int lclusterbits = vi->z_logical_clusterbits; const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + unsigned int totalidx = erofs_iblks(inode); unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; erofs_off_t pos; @@ -290,7 +275,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, out: pos += lcn * (1 << amortizedshift); m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP); + erofs_blknr(inode->i_sb, pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); @@ -301,10 +286,10 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, { const unsigned int datamode = EROFS_I(m->inode)->datalayout; - if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + if (datamode == EROFS_INODE_COMPRESSED_FULL) return legacy_load_cluster_from_disk(m, lcn); - if (datamode == EROFS_INODE_FLAT_COMPRESSION) + if (datamode == EROFS_INODE_COMPRESSED_COMPACT) return compacted_load_cluster_from_disk(m, lcn, lookahead); return -EINVAL; @@ -326,7 +311,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, return err; switch (m->type) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: if (!m->delta[0]) { erofs_err(m->inode->i_sb, "invalid lookback distance 0 @ nid %llu", @@ -336,9 +321,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } lookback_distance = m->delta[0]; continue; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; @@ -360,21 +345,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, unsigned int initial_lcn) { + struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); struct erofs_map_blocks *const map = m->map; const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned long lcn; int err; - DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN && + m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2); DBG_BUGON(m->type != m->headtype); - if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && + if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || - ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1ULL << lclusterbits; return 0; @@ -396,19 +382,19 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, * BUG_ON in the debugging mode only for developers to notice that. */ DBG_BUGON(lcn == initial_lcn && - m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD); + m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); switch (m->type) { - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. */ - m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE); + m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits); break; - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; if (m->compressedblks) @@ -422,7 +408,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, return -EFSCORRUPTED; } out: - map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE; + map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: erofs_err(m->inode->i_sb, @@ -452,12 +438,12 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) if (err) return err; - if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { DBG_BUGON(!m->delta[1] && m->clusterofs != 1 << lclusterbits); - } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { + } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || + m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || + m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { /* go on until the next HEAD lcluster */ if (lcn != headlcn) break; @@ -476,8 +462,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) } static int z_erofs_do_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map, int flags) { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; @@ -507,9 +492,9 @@ static int z_erofs_do_map_blocks(struct inode *inode, end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; @@ -534,7 +519,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, map->m_flags |= EROFS_MAP_FULL_MAPPED; m.delta[0] = 1; fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) @@ -555,7 +540,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ if (fragment && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { @@ -565,13 +550,13 @@ static int z_erofs_do_map_blocks(struct inode *inode, } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_FRAGMENT; } else { - map->m_pa = blknr_to_addr(m.pblk); + map->m_pa = erofs_pos(inode->i_sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; } - if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) { if (map->m_llen > map->m_plen) { DBG_BUGON(1); err = -EFSCORRUPTED; @@ -583,7 +568,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, else map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { map->m_algorithmformat = vi->z_algorithmtype[1]; } else { map->m_algorithmformat = vi->z_algorithmtype[0]; @@ -592,7 +577,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && - map->m_llen >= EROFS_BLKSIZ)) { + map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; @@ -600,9 +585,6 @@ static int z_erofs_do_map_blocks(struct inode *inode, unmap_out: erofs_unmap_metabuf(&m.map->buf); - erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", - __func__, map->m_la, map->m_pa, - map->m_llen, map->m_plen, map->m_flags); return err; } @@ -633,13 +615,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out_unlock; } - h = kaddr + erofs_blkoff(pos); + h = kaddr + erofs_blkoff(sb, pos); /* * if the highest bit of the 8-byte map header is set, the whole file * is stored in the packed inode. The rest bits keeps z_fragmentoff. @@ -663,7 +645,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -672,7 +654,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) err = -EFSCORRUPTED; goto out_put_metabuf; } - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", @@ -692,7 +674,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_put_metabuf(&map.buf); if (!map.m_plen || - erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { + erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { erofs_err(sb, "invalid tail-packing pclustersize %llu", map.m_plen); err = -EFSCORRUPTED; diff --git a/fs/eventfd.c b/fs/eventfd.c index 249ca6c0b784..95850a13ce8d 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -228,7 +228,6 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) struct file *file = iocb->ki_filp; struct eventfd_ctx *ctx = file->private_data; __u64 ucnt = 0; - DECLARE_WAITQUEUE(wait, current); if (iov_iter_count(to) < sizeof(ucnt)) return -EINVAL; @@ -239,23 +238,11 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) spin_unlock_irq(&ctx->wqh.lock); return -EAGAIN; } - __add_wait_queue(&ctx->wqh, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (ctx->count) - break; - if (signal_pending(current)) { - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&ctx->wqh.lock); - return -ERESTARTSYS; - } + + if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) { spin_unlock_irq(&ctx->wqh.lock); - schedule(); - spin_lock_irq(&ctx->wqh.lock); + return -ERESTARTSYS; } - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); } eventfd_ctx_do_read(ctx, &ucnt); current->in_eventfd = 1; @@ -275,7 +262,6 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt; - DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; @@ -288,23 +274,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c if (ULLONG_MAX - ctx->count > ucnt) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { - __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (ULLONG_MAX - ctx->count > ucnt) { - res = sizeof(ucnt); - break; - } - if (signal_pending(current)) { - res = -ERESTARTSYS; - break; - } - spin_unlock_irq(&ctx->wqh.lock); - schedule(); - spin_lock_irq(&ctx->wqh.lock); - } - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); + res = wait_event_interruptible_locked_irq(ctx->wqh, + ULLONG_MAX - ctx->count > ucnt); + if (!res) + res = sizeof(ucnt); } if (likely(res > 0)) { ctx->count += ucnt; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 64659b110973..f6d25050dd7a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -483,8 +483,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the - * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to - * avoid stack blasting. + * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid + * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 641abfa4b718..b126af5f8b15 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -101,8 +101,8 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *, static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY @@ -113,10 +113,6 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = { const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, -#ifdef CONFIG_EXT2_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif #ifdef CONFIG_EXT2_FS_SECURITY &ext2_xattr_security_handler, #endif @@ -125,14 +121,18 @@ const struct xattr_handler *ext2_xattr_handlers[] = { #define EA_BLOCK_CACHE(inode) (EXT2_SB(inode->i_sb)->s_ea_block_cache) -static inline const struct xattr_handler * -ext2_xattr_handler(int name_index) +static inline const char *ext2_xattr_prefix(int name_index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) handler = ext2_xattr_handler_map[name_index]; - return handler; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } static bool @@ -333,11 +333,10 @@ bad_block: /* list the attribute names */ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); entry = EXT2_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext2_xattr_handler(entry->e_name_index); + const char *prefix; - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; + prefix = ext2_xattr_prefix(entry->e_name_index, dentry); + if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f43e526112ae..76d1ef0efbf4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2500,7 +2500,7 @@ static void ext4_apply_quota_options(struct fs_context *fc, qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) - kfree_rcu(qname); + kfree_rcu_mightsleep(qname); } } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 767454d74cd6..dadad29bd81b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -88,8 +88,8 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -101,10 +101,6 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif @@ -173,14 +169,18 @@ static void ext4_xattr_block_csum_set(struct inode *inode, bh->b_blocknr, BHDR(bh)); } -static inline const struct xattr_handler * -ext4_xattr_handler(int name_index) +static inline const char *ext4_xattr_prefix(int name_index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; - return handler; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } static int @@ -740,11 +740,10 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext4_xattr_handler(entry->e_name_index); + const char *prefix; - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; + prefix = ext4_xattr_prefix(entry->e_name_index, dentry); + if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index d92edbbdc30e..213805d3592c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -192,8 +192,8 @@ const struct xattr_handler f2fs_xattr_security_handler = { static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -204,10 +204,6 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, -#ifdef CONFIG_F2FS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY &f2fs_xattr_security_handler, @@ -216,13 +212,18 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { NULL, }; -static inline const struct xattr_handler *f2fs_xattr_handler(int index) +static inline const char *f2fs_xattr_prefix(int index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) handler = f2fs_xattr_handler_map[index]; - return handler; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, @@ -573,12 +574,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) last_base_addr = (void *)base_addr + XATTR_SIZE(inode); list_for_each_xattr(entry, base_addr) { - const struct xattr_handler *handler = - f2fs_xattr_handler(entry->e_name_index); const char *prefix; size_t prefix_len; size_t size; + prefix = f2fs_xattr_prefix(entry->e_name_index, dentry); + if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", @@ -590,10 +591,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) goto cleanup; } - if (!handler || (handler->list && !handler->list(dentry))) + if (!prefix) continue; - prefix = xattr_prefix(handler); prefix_len = strlen(prefix); size = prefix_len + entry->e_name_len + 1; if (buffer) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 195dc23e0d83..1db3e3c24b43 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -978,6 +978,16 @@ restart: continue; } + /* + * If wb_tryget fails, the wb has been shutdown, skip it. + * + * Pin @wb so that it stays on @bdi->wb_list. This allows + * continuing iteration from @wb after dropping and + * regrabbing rcu read lock. + */ + if (!wb_tryget(wb)) + continue; + /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; @@ -986,13 +996,6 @@ restart: work->done = &fallback_work_done; wb_queue_work(wb, work); - - /* - * Pin @wb so that it stays on @bdi->wb_list. This allows - * continuing iteration from @wb after dropping and - * regrabbing rcu read lock. - */ - wb_get(wb); last_wb = wb; rcu_read_unlock(); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eb4f88e3dc97..1a8f82f478cb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2257,30 +2257,31 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, int res; int oldfd; struct fuse_dev *fud = NULL; + struct fd f; switch (cmd) { case FUSE_DEV_IOC_CLONE: - res = -EFAULT; - if (!get_user(oldfd, (__u32 __user *)arg)) { - struct file *old = fget(oldfd); - - res = -EINVAL; - if (old) { - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (old->f_op == file->f_op) - fud = fuse_get_dev(old); - - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fput(old); - } + if (get_user(oldfd, (__u32 __user *)arg)) + return -EFAULT; + + f = fdget(oldfd); + if (!f.file) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (f.file->f_op == file->f_op) + fud = fuse_get_dev(f.file); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); } + fdput(f); break; default: res = -ENOTTY; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index de37a3a06a71..89d97f6188e0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1419,7 +1419,7 @@ out: static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { - return (unsigned long)ii->iov->iov_base + ii->iov_offset; + return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset; } static inline size_t fuse_get_frag_size(const struct iov_iter *ii, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index adf6d17cf033..93b36d026bb4 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1501,8 +1501,6 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = { /* GFS2_FS_FORMAT_MIN */ &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, NULL, }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index abb91f5fae92..b21660475ac1 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -511,7 +511,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; - WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_folder)); + if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) { + pr_err("bad catalog folder entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); @@ -531,7 +535,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) } else if (type == HFSPLUS_FILE) { struct hfsplus_cat_file *file = &entry.file; - WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_file)); + if (fd->entrylength < sizeof(struct hfsplus_cat_file)) { + pr_err("bad catalog file entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_file)); @@ -562,6 +570,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) pr_err("bad catalog entry used to create inode\n"); res = -EIO; } +out: return res; } @@ -570,6 +579,7 @@ int hfsplus_cat_write_inode(struct inode *inode) struct inode *main_inode = inode; struct hfs_find_data fd; hfsplus_cat_entry entry; + int res = 0; if (HFSPLUS_IS_RSRC(inode)) main_inode = HFSPLUS_I(inode)->rsrc_inode; @@ -588,7 +598,11 @@ int hfsplus_cat_write_inode(struct inode *inode) if (S_ISDIR(main_inode->i_mode)) { struct hfsplus_cat_folder *folder = &entry.folder; - WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_folder)); + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { + pr_err("bad catalog folder entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ @@ -613,7 +627,11 @@ int hfsplus_cat_write_inode(struct inode *inode) } else { struct hfsplus_cat_file *file = &entry.file; - WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_file)); + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + pr_err("bad catalog file entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); @@ -634,7 +652,7 @@ int hfsplus_cat_write_inode(struct inode *inode) set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); out: hfs_find_exit(&fd); - return 0; + return res; } int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) diff --git a/fs/inode.c b/fs/inode.c index 4558dc2f1355..3ec5a8f7b644 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1804,8 +1804,8 @@ EXPORT_SYMBOL(bmap); /* * With relative atime, only update atime if the previous atime is - * earlier than either the ctime or mtime or if at least a day has - * passed since the last atime update. + * earlier than or equal to either the ctime or mtime, + * or if at least a day has passed since the last atime update. */ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) @@ -1814,12 +1814,12 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, if (!(mnt->mnt_flags & MNT_RELATIME)) return 1; /* - * Is mtime younger than atime? If yes, update atime: + * Is mtime younger than or equal to atime? If yes, update atime: */ if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) return 1; /* - * Is ctime younger than atime? If yes, update atime: + * Is ctime younger than or equal to atime? If yes, update atime: */ if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) return 1; diff --git a/fs/internal.h b/fs/internal.h index dc4eb91a577a..ab36ed8fa41c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -259,8 +259,6 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po /* * fs/attr.c */ -int setattr_should_drop_sgid(struct mnt_idmap *idmap, - const struct inode *inode); struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index da3e18503c65..aa4048a27f31 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -920,16 +920,13 @@ const struct xattr_handler *jffs2_xattr_handlers[] = { #ifdef CONFIG_JFFS2_FS_SECURITY &jffs2_security_xattr_handler, #endif -#ifdef CONFIG_JFFS2_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &jffs2_trusted_xattr_handler, NULL }; -static const struct xattr_handler *xprefix_to_handler(int xprefix) { - const struct xattr_handler *ret; +static const char *jffs2_xattr_prefix(int xprefix, struct dentry *dentry) +{ + const struct xattr_handler *ret = NULL; switch (xprefix) { case JFFS2_XPREFIX_USER: @@ -942,20 +939,23 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL case JFFS2_XPREFIX_ACL_ACCESS: - ret = &posix_acl_access_xattr_handler; + ret = &nop_posix_acl_access; break; case JFFS2_XPREFIX_ACL_DEFAULT: - ret = &posix_acl_default_xattr_handler; + ret = &nop_posix_acl_default; break; #endif case JFFS2_XPREFIX_TRUSTED: ret = &jffs2_trusted_xattr_handler; break; default: - ret = NULL; - break; + return NULL; } - return ret; + + if (!xattr_handler_can_list(ret, dentry)) + return NULL; + + return xattr_prefix(ret); } ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -966,7 +966,6 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_inode_cache *ic = f->inocache; struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; - const struct xattr_handler *xhandle; const char *prefix; ssize_t prefix_len, len, rc; int retry = 0; @@ -998,10 +997,10 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) goto out; } } - xhandle = xprefix_to_handler(xd->xprefix); - if (!xhandle || (xhandle->list && !xhandle->list(dentry))) + + prefix = jffs2_xattr_prefix(xd->xprefix, dentry); + if (!prefix) continue; - prefix = xhandle->prefix ?: xhandle->name; prefix_len = strlen(prefix); rc = prefix_len + xd->name_len + 1; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 2e8461ce74de..961569c11159 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -691,6 +691,35 @@ void grab_metapage(struct metapage * mp) unlock_page(mp->page); } +static int metapage_write_one(struct page *page) +{ + struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = folio_nr_pages(folio), + }; + int ret = 0; + + BUG_ON(!folio_test_locked(folio)); + + folio_wait_writeback(folio); + + if (folio_clear_dirty_for_io(folio)) { + folio_get(folio); + ret = metapage_writepage(page, &wbc); + if (ret == 0) + folio_wait_writeback(folio); + folio_put(folio); + } else { + folio_unlock(folio); + } + + if (!ret) + ret = filemap_check_errors(mapping); + return ret; +} + void force_metapage(struct metapage *mp) { struct page *page = mp->page; @@ -700,8 +729,8 @@ void force_metapage(struct metapage *mp) get_page(page); lock_page(page); set_page_dirty(page); - if (write_one_page(page)) - jfs_error(mp->sb, "write_one_page() failed\n"); + if (metapage_write_one(page)) + jfs_error(mp->sb, "metapage_write_one() failed\n"); clear_bit(META_forcewrite, &mp->flag); put_page(page); } @@ -746,9 +775,9 @@ void release_metapage(struct metapage * mp) set_page_dirty(page); if (test_bit(META_sync, &mp->flag)) { clear_bit(META_sync, &mp->flag); - if (write_one_page(page)) - jfs_error(mp->sb, "write_one_page() failed\n"); - lock_page(page); /* write_one_page unlocks the page */ + if (metapage_write_one(page)) + jfs_error(mp->sb, "metapage_write_one() failed\n"); + lock_page(page); } } else if (mp->lsn) /* discard_metapage doesn't remove it */ remove_from_logsync(mp); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index f817798fa1eb..931e50018f88 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -986,10 +986,6 @@ static const struct xattr_handler jfs_trusted_xattr_handler = { }; const struct xattr_handler *jfs_xattr_handlers[] = { -#ifdef CONFIG_JFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &jfs_os2_xattr_handler, &jfs_user_xattr_handler, &jfs_security_xattr_handler, diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ef00b5fe8cee..90de0e498371 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1748,12 +1748,6 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, return error; } -/* Relationship between mode and the DT_xxx types */ -static inline unsigned char dt_type(struct kernfs_node *kn) -{ - return (kn->mode >> 12) & 15; -} - static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); @@ -1831,7 +1825,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->name; - unsigned int type = dt_type(pos); + unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 8af939a181be..67b7e766a06b 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -876,17 +876,21 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, } static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, - struct smb2_preauth_neg_context *pneg_ctxt) + struct smb2_preauth_neg_context *pneg_ctxt, + int len_of_ctxts) { - __le32 err = STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP; + /* + * sizeof(smb2_preauth_neg_context) assumes SMB311_SALT_SIZE Salt, + * which may not be present. Only check for used HashAlgorithms[1]. + */ + if (len_of_ctxts < MIN_PREAUTH_CTXT_DATA_LEN) + return STATUS_INVALID_PARAMETER; - if (pneg_ctxt->HashAlgorithms == SMB2_PREAUTH_INTEGRITY_SHA512) { - conn->preauth_info->Preauth_HashId = - SMB2_PREAUTH_INTEGRITY_SHA512; - err = STATUS_SUCCESS; - } + if (pneg_ctxt->HashAlgorithms != SMB2_PREAUTH_INTEGRITY_SHA512) + return STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP; - return err; + conn->preauth_info->Preauth_HashId = SMB2_PREAUTH_INTEGRITY_SHA512; + return STATUS_SUCCESS; } static void decode_encrypt_ctxt(struct ksmbd_conn *conn, @@ -1014,7 +1018,8 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, break; status = decode_preauth_ctxt(conn, - (struct smb2_preauth_neg_context *)pctx); + (struct smb2_preauth_neg_context *)pctx, + len_of_ctxts); if (status != STATUS_SUCCESS) break; } else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) { diff --git a/fs/libfs.c b/fs/libfs.c index 4eda519c3002..89cf614a3271 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -174,12 +174,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(dcache_dir_lseek); -/* Relationship between i_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct inode *inode) -{ - return (inode->i_mode >> 12) & 15; -} - /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), @@ -206,7 +200,8 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, - d_inode(next)->i_ino, dt_type(d_inode(next)))) + d_inode(next)->i_ino, + fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; p = &next->d_child; diff --git a/fs/namei.c b/fs/namei.c index edfedfbccaef..f04f7be58933 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3574,9 +3574,9 @@ static int do_open(struct nameidata *nd, /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from - * @dentry: pointer to dentry of the base directory + * @parentpath: pointer to the path of the base directory + * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile - * @open_flag: flags * * Create a temporary file. * diff --git a/fs/namespace.c b/fs/namespace.c index 6836e937ee61..54847db5b819 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2617,15 +2617,12 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); - struct tm tm; - time64_to_tm(sb->s_time_max, 0, &tm); - - pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n", + pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", - mntpath, - tm.tm_year+1900, (unsigned long long)sb->s_time_max); + mntpath, &sb->s_time_max, + (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; @@ -4197,7 +4194,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, int err = 0; struct ns_common *ns; struct user_namespace *mnt_userns; - struct file *file; + struct fd f; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; @@ -4213,16 +4210,16 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (attr->userns_fd > INT_MAX) return -EINVAL; - file = fget(attr->userns_fd); - if (!file) + f = fdget(attr->userns_fd); + if (!f.file) return -EBADF; - if (!proc_ns_file(file)) { + if (!proc_ns_file(f.file)) { err = -EINVAL; goto out_fput; } - ns = get_proc_ns(file_inode(file)); + ns = get_proc_ns(file_inode(f.file)); if (ns->ops->type != CLONE_NEWUSER) { err = -EINVAL; goto out_fput; @@ -4251,7 +4248,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, kattr->mnt_userns = get_user_ns(mnt_userns); out_fput: - fput(file); + fdput(f); return err; } diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index e9a45dea748a..8a4c86687429 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -139,7 +139,7 @@ static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter, size_t seg = min_t(size_t, PAGE_SIZE - off, len); *pages++ = NULL; - sg_set_page(sg, page, len, off); + sg_set_page(sg, page, seg, off); sgtable->nents++; sg++; len -= seg; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 222a28320e1c..97a76706fd54 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -717,9 +717,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_KILL_SUID) != 0 && inode->i_mode & S_ISUID) inode->i_mode &= ~S_ISUID; - if ((attr->ia_valid & ATTR_KILL_SGID) != 0 && - (inode->i_mode & (S_ISGID | S_IXGRP)) == - (S_ISGID | S_IXGRP)) + if (setattr_should_drop_sgid(&nop_mnt_idmap, inode)) inode->i_mode &= ~S_ISGID; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 4fa37dc038b5..b333ea119ef5 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -17,7 +17,6 @@ extern int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); -extern const struct xattr_handler *nfs3_xattr_handlers[]; #else static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 1247f544a440..349cc4f60a28 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -300,12 +300,6 @@ fail: goto out; } -const struct xattr_handler *nfs3_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - NULL, -}; - static int nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, size_t size, ssize_t *result) diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index 7c5809431e61..8a9be9e47f76 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -14,9 +14,6 @@ struct nfs_subversion nfs_v3 = { .rpc_vers = &nfs_version3, .rpc_ops = &nfs_v3_clientops, .sops = &nfs_sops, -#ifdef CONFIG_NFS_V3_ACL - .xattr = nfs3_xattr_handlers, -#endif }; static int __init init_nfs_v3(void) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 05ae23657527..397c096d874e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1274,9 +1274,6 @@ int nfs_get_tree_common(struct fs_context *fc) if (ctx->clone_data.sb->s_flags & SB_SYNCHRONOUS) fc->sb_flags |= SB_SYNCHRONOUS; - if (server->caps & NFS_CAP_SECURITY_LABEL) - fc->lsm_flags |= SECURITY_LSM_NATIVE_LABELS; - /* Get a superblock - note that we may end up sharing one that already exists */ fc->s_fs_info = server; s = sget_fc(fc, compare_super, nfs_set_super); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e2e485167ac4..76db2fe29624 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3446,8 +3446,7 @@ out_acl: p = xdr_reserve_space(xdr, 4); if (!p) goto out_resource; - err = xattr_supported_namespace(d_inode(dentry), - XATTR_USER_PREFIX); + err = xattr_supports_user_prefix(d_inode(dentry)); *p++ = cpu_to_be32(err == 0); } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6ad41390fa74..228659612c0d 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -430,6 +430,23 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) return 0; } +/** + * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area + * @sci: segment constructor object + * + * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of + * the current segment summary block. + */ +static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci) +{ + struct nilfs_segsum_pointer *ssp; + + ssp = sci->sc_blk_cnt > 0 ? &sci->sc_binfo_ptr : &sci->sc_finfo_ptr; + if (ssp->offset < ssp->bh->b_size) + memset(ssp->bh->b_data + ssp->offset, 0, + ssp->bh->b_size - ssp->offset); +} + static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; @@ -438,6 +455,7 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) * The current segment is filled up * (internal code) */ + nilfs_segctor_zeropad_segsum(sci); sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } @@ -542,6 +560,7 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, goto retry; } if (unlikely(required)) { + nilfs_segctor_zeropad_segsum(sci); err = nilfs_segbuf_extend_segsum(segbuf); if (unlikely(err)) goto failed; @@ -1533,6 +1552,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } + nilfs_segctor_zeropad_segsum(sci); nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); return 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8f430bfad487..22fb1cf7e1fc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -663,7 +663,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; - struct file *f = NULL; + struct file *f = NULL, *pidfd_file = NULL; int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -718,7 +718,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, !pid_has_task(event->pid, PIDTYPE_TGID)) { pidfd = FAN_NOPIDFD; } else { - pidfd = pidfd_create(event->pid, 0); + pidfd = pidfd_prepare(event->pid, 0, &pidfd_file); if (pidfd < 0) pidfd = FAN_EPIDFD; } @@ -751,6 +751,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (f) fd_install(fd, f); + if (pidfd_file) + fd_install(pidfd, pidfd_file); + return metadata.event_len; out_close_fd: @@ -759,8 +762,10 @@ out_close_fd: fput(f); } - if (pidfd >= 0) - close_fd(pidfd); + if (pidfd >= 0) { + put_unused_fd(pidfd); + fput(pidfd_file); + } return ret; } diff --git a/fs/nsfs.c b/fs/nsfs.c index f8df60b3b901..f602a96a1afe 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -235,24 +235,6 @@ bool proc_ns_file(const struct file *file) return file->f_op == &ns_file_operations; } -struct file *proc_ns_fget(int fd) -{ - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &ns_file_operations) - goto out_invalid; - - return file; - -out_invalid: - fput(file); - return ERR_PTR(-EINVAL); -} - /** * ns_match() - Returns true if current namespace matches dev/ino provided. * @ns: current namespace diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index ff64302e87e5..7324cf924932 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -1033,10 +1033,6 @@ static const struct xattr_handler ntfs_other_xattr_handler = { }; const struct xattr_handler *ntfs_xattr_handlers[] = { -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ntfs_other_xattr_handler, NULL, }; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0394505fdce3..8dfc284e85f0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2463,7 +2463,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, - ocfs2_dio_end_io, NULL, 0); + ocfs2_dio_end_io, 0); } const struct address_space_operations ocfs2_aops = { diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 9175dbc47201..17c52225b87d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -242,6 +242,7 @@ static int ocfs2_mknod(struct mnt_idmap *idmap, int want_meta = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota_inode = 0; @@ -1805,6 +1806,7 @@ static int ocfs2_symlink(struct mnt_idmap *idmap, int want_clusters = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota = 0, did_quota_inode = 0; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 5a656dc683f1..564ab48d03ef 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2952,10 +2952,11 @@ retry: */ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { if (PageDirty(page)) { - /* - * write_on_page will unlock the page on return - */ - ret = write_one_page(page); + unlock_page(page); + put_page(page); + + ret = filemap_write_and_wait_range(mapping, + offset, map_end - 1); goto retry; } } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 389308efe854..4ac77ff6e676 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -89,21 +89,17 @@ static struct ocfs2_xattr_def_value_root def_xv = { const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL }; static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { - [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &posix_acl_access_xattr_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &posix_acl_default_xattr_handler, - [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, - [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; struct ocfs2_xattr_info { @@ -7259,9 +7255,21 @@ static int ocfs2_xattr_security_set(const struct xattr_handler *handler, static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { + struct ocfs2_security_xattr_info *si = fs_info; const struct xattr *xattr; int err = 0; + if (si) { + si->value = kmemdup(xattr_array->value, xattr_array->value_len, + GFP_KERNEL); + if (!si->value) + return -ENOMEM; + + si->name = xattr_array->name; + si->value_len = xattr_array->value_len; + return 0; + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, xattr->name, xattr->value, @@ -7277,13 +7285,23 @@ int ocfs2_init_security_get(struct inode *inode, const struct qstr *qstr, struct ocfs2_security_xattr_info *si) { + int ret; + /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - if (si) - return security_old_inode_init_security(inode, dir, qstr, - &si->name, &si->value, - &si->value_len); + if (si) { + ret = security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, si); + /* + * security_inode_init_security() does not return -EOPNOTSUPP, + * we have to check the xattr ourselves. + */ + if (!ret && !si->name) + si->enable = 0; + + return ret; + } return security_inode_init_security(inode, dir, qstr, &ocfs2_initxattrs, NULL); diff --git a/fs/open.c b/fs/open.c index 4401a73d4032..4478adcc4f3a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1196,13 +1196,21 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) } /* - * In order to ensure programs get explicit errors when trying to use - * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it - * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we - * have to require userspace to explicitly set it. + * Block bugs where O_DIRECTORY | O_CREAT created regular files. + * Note, that blocking O_DIRECTORY | O_CREAT here also protects + * O_TMPFILE below which requires O_DIRECTORY being raised. */ + if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) + return -EINVAL; + + /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { - if ((flags & O_TMPFILE_MASK) != O_TMPFILE) + /* + * In order to ensure programs get explicit errors when trying + * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY + * is raised alongside __O_TMPFILE. + */ + if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 6ecad4f94ae6..68b62689a63e 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -555,8 +555,6 @@ static const struct xattr_handler orangefs_xattr_default_handler = { }; const struct xattr_handler *orangefs_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, &orangefs_xattr_default_handler, NULL }; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index c14e90764e35..f658cc8ea492 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -81,8 +81,7 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de int error = 0; size_t slen; - if (!(old->d_inode->i_opflags & IOP_XATTR) || - !(new->d_inode->i_opflags & IOP_XATTR)) + if (!old->d_inode->i_op->listxattr || !new->d_inode->i_op->listxattr) return 0; list_size = vfs_listxattr(old, NULL, 0); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f1d9f75f8786..f97ad8b40dbb 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1055,20 +1055,12 @@ static const struct xattr_handler ovl_other_xattr_handler = { }; static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, NULL }; static const struct xattr_handler *ovl_user_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, NULL diff --git a/fs/pnode.c b/fs/pnode.c index 468e4e65a615..3cede8b18c8b 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin) /* all accesses are serialized by namespace_sem */ static struct mount *last_dest, *first_source, *last_source, *dest_master; -static struct mountpoint *mp; static struct hlist_head *list; static inline bool peers(struct mount *m1, struct mount *m2) @@ -222,7 +221,7 @@ static inline bool peers(struct mount *m1, struct mount *m2) return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } -static int propagate_one(struct mount *m) +static int propagate_one(struct mount *m, struct mountpoint *dest_mp) { struct mount *child; int type; @@ -230,7 +229,7 @@ static int propagate_one(struct mount *m) if (IS_MNT_NEW(m)) return 0; /* skip if mountpoint isn't covered by it */ - if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; if (peers(m, last_dest)) { type = CL_MAKE_SHARED; @@ -262,7 +261,7 @@ static int propagate_one(struct mount *m) if (IS_ERR(child)) return PTR_ERR(child); read_seqlock_excl(&mount_lock); - mnt_set_mountpoint(m, mp, child); + mnt_set_mountpoint(m, dest_mp, child); if (m->mnt_master != dest_master) SET_MNT_MARK(m->mnt_master); read_sequnlock_excl(&mount_lock); @@ -299,13 +298,12 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; - mp = dest_mp; list = tree_list; dest_master = dest_mnt->mnt_master; /* all peers of dest_mnt, except dest_mnt itself */ for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { - ret = propagate_one(n); + ret = propagate_one(n, dest_mp); if (ret) goto out; } @@ -316,7 +314,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, /* everything in that slave group */ n = m; do { - ret = propagate_one(n); + ret = propagate_one(n, dest_mp); if (ret) goto out; n = next_peer(n); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5a76fb35923a..7fa1b738bbab 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -957,25 +957,62 @@ set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, } EXPORT_SYMBOL(set_posix_acl); +int posix_acl_listxattr(struct inode *inode, char **buffer, + ssize_t *remaining_size) +{ + int err; + + if (!IS_POSIXACL(inode)) + return 0; + + if (inode->i_acl) { + err = xattr_list_one(buffer, remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + + if (inode->i_default_acl) { + err = xattr_list_one(buffer, remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } + + return 0; +} + static bool posix_acl_xattr_list(struct dentry *dentry) { return IS_POSIXACL(d_backing_inode(dentry)); } -const struct xattr_handler posix_acl_access_xattr_handler = { +/* + * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs + * + * This is the legacy POSIX ACL access xattr handler. It is used by some + * filesystems to implement their ->listxattr() inode operation. New code + * should never use them. + */ +const struct xattr_handler nop_posix_acl_access = { .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, }; -EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); +EXPORT_SYMBOL_GPL(nop_posix_acl_access); -const struct xattr_handler posix_acl_default_xattr_handler = { +/* + * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs + * + * This is the legacy POSIX ACL default xattr handler. It is used by some + * filesystems to implement their ->listxattr() inode operation. New code + * should never use them. + */ +const struct xattr_handler nop_posix_acl_default = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, }; -EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); +EXPORT_SYMBOL_GPL(nop_posix_acl_default); int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) @@ -1094,12 +1131,10 @@ retry_deleg: if (error) goto out_inode_unlock; - if (inode->i_opflags & IOP_XATTR) + if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, kacl); - else if (unlikely(is_bad_inode(inode))) - error = -EIO; else - error = -EOPNOTSUPP; + error = -EIO; if (!error) { fsnotify_xattr(dentry); evm_inode_post_set_acl(dentry, acl_name, kacl); @@ -1204,12 +1239,10 @@ retry_deleg: if (error) goto out_inode_unlock; - if (inode->i_opflags & IOP_XATTR) + if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, NULL); - else if (unlikely(is_bad_inode(inode))) - error = -EIO; else - error = -EOPNOTSUPP; + error = -EIO; if (!error) { fsnotify_xattr(dentry); evm_inode_post_remove_acl(idmap, dentry, acl_name); diff --git a/fs/qnx4/README b/fs/qnx4/README deleted file mode 100644 index 1f1e320d91da..000000000000 --- a/fs/qnx4/README +++ /dev/null @@ -1,9 +0,0 @@ - - This is a snapshot of the QNX4 filesystem for Linux. - Please send diffs and remarks to <al@alarsen.net> . - -Credits : - -Richard "Scuba" A. Frowijn <scuba@wxs.nl> -Frank "Jedi/Sector One" Denis <j@pureftpd.org> -Anders Larsen <al@alarsen.net> (Maintainer) diff --git a/fs/qnx6/README b/fs/qnx6/README deleted file mode 100644 index 116d622026cc..000000000000 --- a/fs/qnx6/README +++ /dev/null @@ -1,8 +0,0 @@ - - This is a snapshot of the QNX6 filesystem for Linux. - Please send diffs and remarks to <chaosman@ontika.net> . - -Credits : - -Al Viro <viro@ZenIV.linux.org.uk> (endless patience with me & support ;)) -Kai Bankett <chaosman@ontika.net> (Maintainer) diff --git a/fs/read_write.c b/fs/read_write.c index 7a2ff6157eda..a21ba3be7dbe 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -749,15 +749,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return -EOPNOTSUPP; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; if (type == READ) { - nr = filp->f_op->read(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->read(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } else { - nr = filp->f_op->write(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->write(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } if (nr < 0) { @@ -766,7 +765,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, break; } ret += nr; - if (nr != iovec.iov_len) + if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 467d13da198f..b54cc7048f02 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -261,3 +261,10 @@ const struct inode_operations reiserfs_file_inode_operations = { .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, }; + +const struct inode_operations reiserfs_priv_file_inode_operations = { + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, + .fileattr_get = reiserfs_fileattr_get, + .fileattr_set = reiserfs_fileattr_set, +}; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d54cab854f60..d8debbb6105f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2087,10 +2087,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, * Mark it private if we're creating the privroot * or something under it. */ - if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) { - inode->i_flags |= S_PRIVATE; - inode->i_opflags &= ~IOP_XATTR; - } + if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) + reiserfs_init_priv_inode(inode); if (reiserfs_posixacl(inode->i_sb)) { reiserfs_write_unlock(inode->i_sb); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 42d2c20e1345..52240cc891cf 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -378,13 +378,11 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, /* * Propagate the private flag so we know we're - * in the priv tree. Also clear IOP_XATTR + * in the priv tree. Also clear xattr support * since we don't have xattrs on xattr files. */ - if (IS_PRIVATE(dir)) { - inode->i_flags |= S_PRIVATE; - inode->i_opflags &= ~IOP_XATTR; - } + if (IS_PRIVATE(dir)) + reiserfs_init_priv_inode(inode); } reiserfs_write_unlock(dir->i_sb); if (retval == IO_ERROR) { @@ -1649,6 +1647,48 @@ static int reiserfs_rename(struct mnt_idmap *idmap, return retval; } +static const struct inode_operations reiserfs_priv_dir_inode_operations = { + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, + .fileattr_get = reiserfs_fileattr_get, + .fileattr_set = reiserfs_fileattr_set, +}; + +static const struct inode_operations reiserfs_priv_symlink_inode_operations = { + .get_link = page_get_link, + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, +}; + +static const struct inode_operations reiserfs_priv_special_inode_operations = { + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, +}; + +void reiserfs_init_priv_inode(struct inode *inode) +{ + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + + if (S_ISREG(inode->i_mode)) + inode->i_op = &reiserfs_priv_file_inode_operations; + else if (S_ISDIR(inode->i_mode)) + inode->i_op = &reiserfs_priv_dir_inode_operations; + else if (S_ISLNK(inode->i_mode)) + inode->i_op = &reiserfs_priv_symlink_inode_operations; + else + inode->i_op = &reiserfs_priv_special_inode_operations; +} + /* directories can handle most operations... */ const struct inode_operations reiserfs_dir_inode_operations = { .create = reiserfs_create, diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 98e6f53c2fe0..1bccf6a2e908 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3106,6 +3106,7 @@ int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); /* namei.c */ +void reiserfs_init_priv_inode(struct inode *inode); void set_de_name_and_namelen(struct reiserfs_dir_entry *de); int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, struct treepath *path, struct reiserfs_dir_entry *de); @@ -3175,6 +3176,7 @@ void reiserfs_unmap_buffer(struct buffer_head *); /* file.c */ extern const struct inode_operations reiserfs_file_inode_operations; +extern const struct inode_operations reiserfs_priv_file_inode_operations; extern const struct file_operations reiserfs_file_operations; extern const struct address_space_operations reiserfs_address_space_operations; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 06d810c72c52..651027967159 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -52,6 +52,7 @@ #include <linux/quotaops.h> #include <linux/security.h> #include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -770,23 +771,34 @@ out: (handler) != NULL; \ (handler) = *(handlers)++) +static inline bool reiserfs_posix_acl_list(const char *name, + struct dentry *dentry) +{ + return (posix_acl_type(name) >= 0) && + IS_POSIXACL(d_backing_inode(dentry)); +} + /* This is the implementation for the xattr plugin infrastructure */ -static inline const struct xattr_handler * -find_xattr_handler_prefix(const struct xattr_handler **handlers, - const char *name) +static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers, + const char *name, struct dentry *dentry) { - const struct xattr_handler *xah; + if (handlers) { + const struct xattr_handler *xah = NULL; - if (!handlers) - return NULL; + for_each_xattr_handler(handlers, xah) { + const char *prefix = xattr_prefix(xah); - for_each_xattr_handler(handlers, xah) { - const char *prefix = xattr_prefix(xah); - if (strncmp(prefix, name, strlen(prefix)) == 0) - break; + if (strncmp(prefix, name, strlen(prefix))) + continue; + + if (!xattr_handler_can_list(xah, dentry)) + return false; + + return true; + } } - return xah; + return reiserfs_posix_acl_list(name, dentry); } struct listxattr_buf { @@ -807,12 +819,8 @@ static bool listxattr_filler(struct dir_context *ctx, const char *name, if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, - name); - if (!handler /* Unsupported xattr name */ || - (handler->list && !handler->list(b->dentry))) + if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name, + b->dentry)) return true; size = namelen + 1; if (b->buf) { @@ -888,8 +896,7 @@ static int create_privroot(struct dentry *dentry) return -EOPNOTSUPP; } - d_inode(dentry)->i_flags |= S_PRIVATE; - d_inode(dentry)->i_opflags &= ~IOP_XATTR; + reiserfs_init_priv_inode(d_inode(dentry)); reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " "storage.\n", PRIVROOT_NAME); @@ -911,10 +918,6 @@ const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_SECURITY &reiserfs_xattr_security_handler, #endif -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif NULL }; @@ -975,10 +978,8 @@ int reiserfs_lookup_privroot(struct super_block *s) if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (d_really_is_positive(dentry)) { - d_inode(dentry)->i_flags |= S_PRIVATE; - d_inode(dentry)->i_opflags &= ~IOP_XATTR; - } + if (d_really_is_positive(dentry)) + reiserfs_init_priv_inode(d_inode(dentry)); } else err = PTR_ERR(dentry); inode_unlock(d_inode(s->s_root)); diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 41c0ea84fbff..6e0a099dd788 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -39,6 +39,22 @@ static bool security_list(struct dentry *dentry) return !IS_PRIVATE(d_inode(dentry)); } +static int +reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + struct reiserfs_security_handle *sec = fs_info; + + sec->value = kmemdup(xattr_array->value, xattr_array->value_len, + GFP_KERNEL); + if (!sec->value) + return -ENOMEM; + + sec->name = xattr_array->name; + sec->length = xattr_array->value_len; + return 0; +} + /* Initializes the security context for a new inode and returns the number * of blocks needed for the transaction. If successful, reiserfs_security * must be released using reiserfs_security_free when the caller is done. */ @@ -56,12 +72,9 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_old_inode_init_security(inode, dir, qstr, &sec->name, - &sec->value, &sec->length); + error = security_inode_init_security(inode, dir, qstr, + &reiserfs_initxattrs, sec); if (error) { - if (error == -EOPNOTSUPP) - error = 0; - sec->name = NULL; sec->value = NULL; sec->length = 0; @@ -82,11 +95,15 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_security_handle *sec) { + char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX; int error; - if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX)) + + if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX) return -EINVAL; - error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value, + strlcat(xattr_name, sec->name, sizeof(xattr_name)); + + error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value, sec->length, XATTR_CREATE); if (error == -ENODATA || error == -EOPNOTSUPP) error = 0; diff --git a/fs/splice.c b/fs/splice.c index 2c3dec2b6dfa..0af8d150394f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -30,6 +30,7 @@ #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> +#include <linux/fsnotify.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> @@ -1165,6 +1166,9 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); + if (ret > 0) + fsnotify_modify(out); + if (!off_out) out->f_pos = offset; else @@ -1188,6 +1192,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, flags |= SPLICE_F_NONBLOCK; ret = splice_file_to_pipe(in, opipe, &offset, len, flags); + + if (ret > 0) + fsnotify_access(in); + if (!off_in) in->f_pos = offset; else diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 999bceb99974..cdb3d632c63d 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -28,12 +28,6 @@ const struct file_operations sysv_dir_operations = { .fsync = generic_file_fsync, }; -inline void dir_put_page(struct page *page, void *page_addr) -{ - kunmap_local((void *)((unsigned long)page_addr & PAGE_MASK)); - put_page(page); -} - static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; @@ -58,7 +52,7 @@ static int sysv_handle_dirsync(struct inode *dir) } /* - * Calls to dir_get_page()/dir_put_page() must be nested according to the + * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the * rules documented in mm/highmem.rst. * * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() @@ -109,11 +103,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); return 0; } } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } return 0; } @@ -137,7 +131,7 @@ static inline int namecompare(int len, int maxlen, * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * - * On Success dir_put_page() should be called on *res_page. + * On Success put_and_unmap_page() should be called on *res_page. * * sysv_find_entry() acts as a call to dir_get_page() and must be treated * accordingly for nesting purposes. @@ -172,7 +166,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } if (++n >= npages) @@ -215,7 +209,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto out_page; de++; } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } BUG(); return -EINVAL; @@ -234,7 +228,7 @@ got_it: mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -327,12 +321,12 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } return 1; not_empty: - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); return 0; } @@ -358,7 +352,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } /* - * Calls to dir_get_page()/dir_put_page() must be nested according to the + * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the * rules documented in mm/highmem.rst. * * sysv_dotdot() acts as a call to dir_get_page() and must be treated @@ -382,7 +376,7 @@ ino_t sysv_inode_by_name(struct dentry *dentry) if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - dir_put_page(page, de); + put_and_unmap_page(page, de); } return res; } diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index a25862773d82..2b2dba4c4f56 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -164,7 +164,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); } - dir_put_page(page, de); + put_and_unmap_page(page, de); return err; } @@ -227,7 +227,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; err = sysv_set_link(new_de, new_page, old_inode); - dir_put_page(new_page, new_de); + put_and_unmap_page(new_page, new_de); if (err) goto out_dir; new_inode->i_ctime = current_time(new_inode); @@ -256,9 +256,9 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, out_dir: if (dir_de) - dir_put_page(dir_page, dir_de); + put_and_unmap_page(dir_page, dir_de); out_old: - dir_put_page(old_page, old_de); + put_and_unmap_page(old_page, old_de); out: return err; } diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index f2c36ea42df6..e3f988b469ee 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -148,7 +148,6 @@ extern void sysv_destroy_icache(void); /* dir.c */ -extern void dir_put_page(struct page *page, void *vaddr); extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); extern int sysv_add_link(struct dentry *, struct inode *); extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 391efaf1d528..379d75796a5c 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -42,11 +42,10 @@ static inline int ufs_match(struct super_block *sb, int len, return !memcmp(name, de->d_name, len); } -static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); @@ -54,10 +53,16 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) i_size_write(dir, pos+len); mark_inode_dirty(dir); } - if (IS_DIRSYNC(dir)) - err = write_one_page(page); - else - unlock_page(page); + unlock_page(page); +} + +static int ufs_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); return err; } @@ -99,11 +104,12 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); - err = ufs_commit_chunk(page, pos, len); + ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + ufs_handle_dirsync(dir); } @@ -390,10 +396,11 @@ got_it: de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - err = ufs_commit_chunk(page, pos, rec_len); + ufs_commit_chunk(page, pos, rec_len); dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + err = ufs_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: ufs_put_page(page); @@ -531,9 +538,10 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; - err = ufs_commit_chunk(page, pos, to - from); + ufs_commit_chunk(page, pos, to - from); inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); + err = ufs_handle_dirsync(inode); out: ufs_put_page(page); UFSD("EXIT\n"); @@ -579,7 +587,8 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) strcpy (de->d_name, ".."); kunmap(page); - err = ufs_commit_chunk(page, 0, chunk_size); + ufs_commit_chunk(page, 0, chunk_size); + err = ufs_handle_dirsync(inode); fail: put_page(page); return err; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 44d1ee429eb0..40f9e1a2ebdd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1955,8 +1955,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - /* Ignore unsupported features (userspace built against newer kernel) */ - features = uffdio_api.features & UFFD_API_FEATURES; + features = uffdio_api.features; + ret = -EINVAL; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + goto err_out; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; diff --git a/fs/xattr.c b/fs/xattr.c index 14a7eb3c8fa8..fcf67d80d7f9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -160,11 +160,10 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode, * Look for any handler that deals with the specified namespace. */ int -xattr_supported_namespace(struct inode *inode, const char *prefix) +xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler **handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; - size_t preflen; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) @@ -172,16 +171,15 @@ xattr_supported_namespace(struct inode *inode, const char *prefix) return -EOPNOTSUPP; } - preflen = strlen(prefix); - for_each_xattr_handler(handlers, handler) { - if (!strncmp(xattr_prefix(handler), prefix, preflen)) + if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } -EXPORT_SYMBOL(xattr_supported_namespace); +EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, @@ -460,6 +458,28 @@ nolsm: } EXPORT_SYMBOL_GPL(vfs_getxattr); +/** + * vfs_listxattr - retrieve \0 separated list of xattr names + * @dentry: the dentry from whose inode the xattr names are retrieved + * @list: buffer to store xattr names into + * @size: size of the buffer + * + * This function returns the names of all xattrs associated with the + * inode of @dentry. + * + * Note, for legacy reasons the vfs_listxattr() function lists POSIX + * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the + * vfs_listxattr() function doesn't check for this flag since a + * filesystem could implement POSIX ACLs without implementing any other + * xattrs. + * + * However, since all codepaths that remove IOP_XATTR also assign of + * inode operations that either don't implement or implement a stub + * ->listxattr() operation. + * + * Return: On success, the size of the buffer that was used. On error a + * negative error code. + */ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { @@ -469,7 +489,8 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) error = security_inode_listxattr(dentry); if (error) return error; - if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) { + + if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); @@ -949,6 +970,21 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) return error; } +int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) +{ + size_t len; + + len = strlen(name) + 1; + if (*buffer) { + if (*remaining_size < len) + return -ERANGE; + memcpy(*buffer, name, len); + *buffer += len; + } + *remaining_size -= len; + return 0; +} + /* * Combine the results of the list() operation from every xattr_handler in the * list. @@ -957,33 +993,22 @@ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; - unsigned int size = 0; - - if (!buffer) { - for_each_xattr_handler(handlers, handler) { - if (!handler->name || - (handler->list && !handler->list(dentry))) - continue; - size += strlen(handler->name) + 1; - } - } else { - char *buf = buffer; - size_t len; - - for_each_xattr_handler(handlers, handler) { - if (!handler->name || - (handler->list && !handler->list(dentry))) - continue; - len = strlen(handler->name); - if (len + 1 > buffer_size) - return -ERANGE; - memcpy(buf, handler->name, len + 1); - buf += len + 1; - buffer_size -= len + 1; - } - size = buf - buffer; + ssize_t remaining_size = buffer_size; + int err = 0; + + err = posix_acl_listxattr(d_inode(dentry), &buffer, &remaining_size); + if (err) + return err; + + for_each_xattr_handler(handlers, handler) { + if (!handler->name || (handler->list && !handler->list(dentry))) + continue; + err = xattr_list_one(&buffer, &remaining_size, handler->name); + if (err) + return err; } - return size; + + return err ? err : buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); @@ -1245,20 +1270,6 @@ static bool xattr_is_trusted(const char *name) return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } -static int xattr_list_one(char **buffer, ssize_t *remaining_size, - const char *name) -{ - size_t len = strlen(name) + 1; - if (*buffer) { - if (*remaining_size < len) - return -ERANGE; - memcpy(*buffer, name, len); - *buffer += len; - } - *remaining_size -= len; - return 0; -} - /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs @@ -1287,22 +1298,9 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, ssize_t remaining_size = size; int err = 0; -#ifdef CONFIG_FS_POSIX_ACL - if (IS_POSIXACL(inode)) { - if (inode->i_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_ACCESS); - if (err) - return err; - } - if (inode->i_default_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_DEFAULT); - if (err) - return err; - } - } -#endif + err = posix_acl_listxattr(inode, &buffer, &remaining_size); + if (err) + return err; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 7b9a0ed1b11f..43e5c219aaed 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -179,10 +179,6 @@ const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, -#ifdef CONFIG_XFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif NULL }; diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 95e4f56f9754..1b4f81f1ac5d 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -723,8 +723,7 @@ typedef u32 acpi_event_type; #define ACPI_EVENT_POWER_BUTTON 2 #define ACPI_EVENT_SLEEP_BUTTON 3 #define ACPI_EVENT_RTC 4 -#define ACPI_EVENT_PCIE_WAKE 5 -#define ACPI_EVENT_MAX 5 +#define ACPI_EVENT_MAX 4 #define ACPI_NUM_FIXED_EVENTS ACPI_EVENT_MAX + 1 /* diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 8845a2eca339..90d7f68ed39d 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -26,6 +26,8 @@ #include <asm/ptrace.h> #include <asm/hyperv-tlfs.h> +#define VTPM_BASE_ADDRESS 0xfed40000 + struct ms_hyperv_info { u32 features; u32 priv_high; diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 68f7aa2a7e55..653992a6e941 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -28,6 +28,10 @@ struct public_key { bool key_is_private; const char *id_type; const char *pkey_algo; + unsigned long key_eflags; /* key extension flags */ +#define KEY_EFLAG_CA 0 /* set if the CA basic constraints is set */ +#define KEY_EFLAG_DIGITALSIG 1 /* set if the digitalSignature usage is set */ +#define KEY_EFLAG_KEYCERTSIGN 2 /* set if the keyCertSign usage is set */ }; extern void public_key_free(struct public_key *key); @@ -71,6 +75,21 @@ extern int restrict_link_by_key_or_keyring_chain(struct key *trust_keyring, const union key_payload *payload, struct key *trusted); +#if IS_REACHABLE(CONFIG_ASYMMETRIC_KEY_TYPE) +extern int restrict_link_by_ca(struct key *dest_keyring, + const struct key_type *type, + const union key_payload *payload, + struct key *trust_keyring); +#else +static inline int restrict_link_by_ca(struct key *dest_keyring, + const struct key_type *type, + const union key_payload *payload, + struct key *trust_keyring) +{ + return 0; +} +#endif + extern int query_asymmetric_key(const struct kernel_pkey_params *, struct kernel_pkey_query *); @@ -80,7 +99,16 @@ extern int create_signature(struct kernel_pkey_params *, const void *, void *); extern int verify_signature(const struct key *, const struct public_key_signature *); +#if IS_REACHABLE(CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE) int public_key_verify_signature(const struct public_key *pkey, const struct public_key_signature *sig); +#else +static inline +int public_key_verify_signature(const struct public_key *pkey, + const struct public_key_signature *sig) +{ + return -EINVAL; +} +#endif #endif /* _LINUX_PUBLIC_KEY_H */ diff --git a/include/kunit/resource.h b/include/kunit/resource.h index cf6fb8f2ac1b..c0d88b318e90 100644 --- a/include/kunit/resource.h +++ b/include/kunit/resource.h @@ -72,7 +72,7 @@ typedef void (*kunit_resource_free_t)(struct kunit_resource *); * params.gfp = gfp; * * return kunit_alloc_resource(test, kunit_kmalloc_init, - * kunit_kmalloc_free, ¶ms); + * kunit_kmalloc_free, gfp, ¶ms); * } * * Resources can also be named, with lookup/removal done on a name diff --git a/include/kunit/test.h b/include/kunit/test.h index 08d3559dd703..57b309c6ca27 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -34,7 +34,7 @@ DECLARE_STATIC_KEY_FALSE(kunit_running); struct kunit; /* Size of log associated with test. */ -#define KUNIT_LOG_SIZE 512 +#define KUNIT_LOG_SIZE 2048 /* Maximum size of parameter description string. */ #define KUNIT_PARAM_DESC_SIZE 128 @@ -420,7 +420,7 @@ void __printf(2, 3) kunit_log_append(char *log, const char *fmt, ...); #define kunit_log(lvl, test_or_suite, fmt, ...) \ do { \ printk(lvl fmt, ##__VA_ARGS__); \ - kunit_log_append((test_or_suite)->log, fmt "\n", \ + kunit_log_append((test_or_suite)->log, fmt, \ ##__VA_ARGS__); \ } while (0) diff --git a/include/linux/fs.h b/include/linux/fs.h index c85916e9f7db..ef2281a2acce 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2675,6 +2675,8 @@ extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs(struct file *); +int setattr_should_drop_sgid(struct mnt_idmap *idmap, + const struct inode *inode); /* * This must be used for allocating filesystems specific inodes to set @@ -2778,7 +2780,7 @@ enum { ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, - dio_iodone_t end_io, dio_submit_t submit_io, + dio_iodone_t end_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, @@ -2787,7 +2789,7 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); + get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 5469ffee21c7..ff6341e09925 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -104,7 +104,6 @@ struct fs_context { unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ unsigned int sb_flags_mask; /* Superblock flags that were changed */ unsigned int s_iflags; /* OR'd with sb->s_iflags */ - unsigned int lsm_flags; /* Information flags from the fs to the LSM */ enum fs_context_purpose purpose:8; enum fs_context_phase phase:8; /* The phase the context is in */ bool need_free:1; /* Need to call ops->free() */ diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 501fa8486749..1b608e00290a 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -15,12 +15,11 @@ /** * instrument_read - instrument regular read access + * @v: address of access + * @size: size of access * * Instrument a regular read access. The instrumentation should be inserted * before the actual read happens. - * - * @ptr address of access - * @size size of access */ static __always_inline void instrument_read(const volatile void *v, size_t size) { @@ -30,12 +29,11 @@ static __always_inline void instrument_read(const volatile void *v, size_t size) /** * instrument_write - instrument regular write access + * @v: address of access + * @size: size of access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. - * - * @ptr address of access - * @size size of access */ static __always_inline void instrument_write(const volatile void *v, size_t size) { @@ -45,12 +43,11 @@ static __always_inline void instrument_write(const volatile void *v, size_t size /** * instrument_read_write - instrument regular read-write access + * @v: address of access + * @size: size of access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. - * - * @ptr address of access - * @size size of access */ static __always_inline void instrument_read_write(const volatile void *v, size_t size) { @@ -60,12 +57,11 @@ static __always_inline void instrument_read_write(const volatile void *v, size_t /** * instrument_atomic_read - instrument atomic read access + * @v: address of access + * @size: size of access * * Instrument an atomic read access. The instrumentation should be inserted * before the actual read happens. - * - * @ptr address of access - * @size size of access */ static __always_inline void instrument_atomic_read(const volatile void *v, size_t size) { @@ -75,12 +71,11 @@ static __always_inline void instrument_atomic_read(const volatile void *v, size_ /** * instrument_atomic_write - instrument atomic write access + * @v: address of access + * @size: size of access * * Instrument an atomic write access. The instrumentation should be inserted * before the actual write happens. - * - * @ptr address of access - * @size size of access */ static __always_inline void instrument_atomic_write(const volatile void *v, size_t size) { @@ -90,12 +85,11 @@ static __always_inline void instrument_atomic_write(const volatile void *v, size /** * instrument_atomic_read_write - instrument atomic read-write access + * @v: address of access + * @size: size of access * * Instrument an atomic read-write access. The instrumentation should be * inserted before the actual write happens. - * - * @ptr address of access - * @size size of access */ static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size) { @@ -105,13 +99,12 @@ static __always_inline void instrument_atomic_read_write(const volatile void *v, /** * instrument_copy_to_user - instrument reads of copy_to_user + * @to: destination address + * @from: source address + * @n: number of bytes to copy * * Instrument reads from kernel memory, that are due to copy_to_user (and * variants). The instrumentation must be inserted before the accesses. - * - * @to destination address - * @from source address - * @n number of bytes to copy */ static __always_inline void instrument_copy_to_user(void __user *to, const void *from, unsigned long n) @@ -123,13 +116,12 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) /** * instrument_copy_from_user_before - add instrumentation before copy_from_user + * @to: destination address + * @from: source address + * @n: number of bytes to copy * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. - * - * @to destination address - * @from source address - * @n number of bytes to copy */ static __always_inline void instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) @@ -140,14 +132,13 @@ instrument_copy_from_user_before(const void *to, const void __user *from, unsign /** * instrument_copy_from_user_after - add instrumentation after copy_from_user + * @to: destination address + * @from: source address + * @n: number of bytes to copy + * @left: number of bytes not copied (as returned by copy_from_user) * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted after the accesses. - * - * @to destination address - * @from source address - * @n number of bytes to copy - * @left number of bytes not copied (as returned by copy_from_user) */ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, @@ -158,12 +149,11 @@ instrument_copy_from_user_after(const void *to, const void __user *from, /** * instrument_get_user() - add instrumentation to get_user()-like macros + * @to: destination variable, may not be address-taken * * get_user() and friends are fragile, so it may depend on the implementation * whether the instrumentation happens before or after the data is copied from * the userspace. - * - * @to destination variable, may not be address-taken */ #define instrument_get_user(to) \ ({ \ @@ -175,14 +165,13 @@ instrument_copy_from_user_after(const void *to, const void __user *from, /** * instrument_put_user() - add instrumentation to put_user()-like macros + * @from: source address + * @ptr: userspace pointer to copy to + * @size: number of bytes to copy * * put_user() and friends are fragile, so it may depend on the implementation * whether the instrumentation happens before or after the data is copied from * the userspace. - * - * @from source address - * @ptr userspace pointer to copy to - * @size number of bytes to copy */ #define instrument_put_user(from, ptr, size) \ ({ \ diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e38ae3c34618..30b17647ce3c 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -134,11 +134,12 @@ void kmsan_kfree_large(const void *ptr); * @page_shift: page_shift passed to vmap_range_noflush(). * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in - * vmalloc metadata address range. + * vmalloc metadata address range. Returns 0 on success, callers must check + * for non-zero return value. */ -void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift); +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -159,11 +160,12 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); * @page_shift: page_shift argument passed to vmap_range_noflush(). * * KMSAN creates new metadata pages for the physical pages mapped into the - * virtual memory. + * virtual memory. Returns 0 on success, callers must check for non-zero return + * value. */ -void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift); +int kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); /** * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. @@ -281,12 +283,13 @@ static inline void kmsan_kfree_large(const void *ptr) { } -static inline void kmsan_vmap_pages_range_noflush(unsigned long start, - unsigned long end, - pgprot_t prot, - struct page **pages, - unsigned int page_shift) +static inline int kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) { + return 0; } static inline void kmsan_vunmap_range_noflush(unsigned long start, @@ -294,12 +297,12 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start, { } -static inline void kmsan_ioremap_page_range(unsigned long start, - unsigned long end, - phys_addr_t phys_addr, - pgprot_t prot, - unsigned int page_shift) +static inline int kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) { + return 0; } static inline void kmsan_iounmap_page_range(unsigned long start, diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1023f349af71..b32256e9e944 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -134,7 +134,8 @@ struct held_lock { unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; - unsigned int references:12; /* 32 bits */ + unsigned int sync:1; + unsigned int references:11; /* 32 bits */ unsigned int pin_count; }; @@ -268,6 +269,10 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, unsigned long ip); +extern void lock_sync(struct lockdep_map *lock, unsigned int subclass, + int read, int check, struct lockdep_map *nest_lock, + unsigned long ip); + /* lock_is_held_type() returns */ #define LOCK_STATE_UNKNOWN -1 #define LOCK_STATE_NOT_HELD 0 @@ -554,6 +559,7 @@ do { \ #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) +#define lock_map_sync(l) lock_sync(l, 0, 0, 1, NULL, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 094b76dc7164..6bb55e61e8e8 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -381,7 +381,7 @@ LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred, LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) -LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **_buffer) +LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer) #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 6e156d2acffc..ab2b2fafa4a4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -29,1630 +29,6 @@ #include <linux/init.h> #include <linux/rculist.h> -/** - * union security_list_options - Linux Security Module hook function list - * - * Security hooks for program execution operations. - * - * @bprm_creds_for_exec: - * If the setup in prepare_exec_creds did not setup @bprm->cred->security - * properly for executing @bprm->file, update the LSM's portion of - * @bprm->cred->security to be what commit_creds needs to install for the - * new program. This hook may also optionally check permissions - * (e.g. for transitions between security domains). - * The hook must set @bprm->secureexec to 1 if AT_SECURE should be set to - * request libc enable secure mode. - * @bprm contains the linux_binprm structure. - * Return 0 if the hook is successful and permission is granted. - * @bprm_creds_from_file: - * If @file is setpcap, suid, sgid or otherwise marked to change - * privilege upon exec, update @bprm->cred to reflect that change. - * This is called after finding the binary that will be executed. - * without an interpreter. This ensures that the credentials will not - * be derived from a script that the binary will need to reopen, which - * when reopend may end up being a completely different file. This - * hook may also optionally check permissions (e.g. for transitions - * between security domains). - * The hook must set @bprm->secureexec to 1 if AT_SECURE should be set to - * request libc enable secure mode. - * The hook must add to @bprm->per_clear any personality flags that - * should be cleared from current->personality. - * @bprm contains the linux_binprm structure. - * Return 0 if the hook is successful and permission is granted. - * @bprm_check_security: - * This hook mediates the point when a search for a binary handler will - * begin. It allows a check against the @bprm->cred->security value - * which was set in the preceding creds_for_exec call. The argv list and - * envp list are reliably available in @bprm. This hook may be called - * multiple times during a single execve. - * @bprm contains the linux_binprm structure. - * Return 0 if the hook is successful and permission is granted. - * @bprm_committing_creds: - * Prepare to install the new security attributes of a process being - * transformed by an execve operation, based on the old credentials - * pointed to by @current->cred and the information set in @bprm->cred by - * the bprm_creds_for_exec hook. @bprm points to the linux_binprm - * structure. This hook is a good place to perform state changes on the - * process such as closing open file descriptors to which access will no - * longer be granted when the attributes are changed. This is called - * immediately before commit_creds(). - * @bprm_committed_creds: - * Tidy up after the installation of the new security attributes of a - * process being transformed by an execve operation. The new credentials - * have, by this point, been set to @current->cred. @bprm points to the - * linux_binprm structure. This hook is a good place to perform state - * changes on the process such as clearing out non-inheritable signal - * state. This is called immediately after commit_creds(). - * - * Security hooks for mount using fs_context. - * [See also Documentation/filesystems/mount_api.rst] - * - * @fs_context_dup: - * Allocate and attach a security structure to sc->security. This pointer - * is initialised to NULL by the caller. - * @fc indicates the new filesystem context. - * @src_fc indicates the original filesystem context. - * Return 0 on success or a negative error code on failure. - * @fs_context_parse_param: - * Userspace provided a parameter to configure a superblock. The LSM may - * reject it with an error and may use it for itself, in which case it - * should return 0; otherwise it should return -ENOPARAM to pass it on to - * the filesystem. - * @fc indicates the filesystem context. - * @param The parameter. - * - * Security hooks for filesystem operations. - * - * @sb_alloc_security: - * Allocate and attach a security structure to the sb->s_security field. - * The s_security field is initialized to NULL when the structure is - * allocated. - * @sb contains the super_block structure to be modified. - * Return 0 if operation was successful. - * @sb_delete: - * Release objects tied to a superblock (e.g. inodes). - * @sb contains the super_block structure being released. - * @sb_free_security: - * Deallocate and clear the sb->s_security field. - * @sb contains the super_block structure to be modified. - * @sb_free_mnt_opts: - * Free memory associated with @mnt_ops. - * @sb_eat_lsm_opts: - * Eat (scan @orig options) and save them in @mnt_opts. - * Return 0 on success, negative values on failure. - * @sb_statfs: - * Check permission before obtaining filesystem statistics for the @mnt - * mountpoint. - * @dentry is a handle on the superblock for the filesystem. - * Return 0 if permission is granted. - * @sb_mount: - * Check permission before an object specified by @dev_name is mounted on - * the mount point named by @nd. For an ordinary mount, @dev_name - * identifies a device if the file system type requires a device. For a - * remount (@flags & MS_REMOUNT), @dev_name is irrelevant. For a - * loopback/bind mount (@flags & MS_BIND), @dev_name identifies the - * pathname of the object being mounted. - * @dev_name contains the name for object being mounted. - * @path contains the path for mount point object. - * @type contains the filesystem type. - * @flags contains the mount flags. - * @data contains the filesystem-specific data. - * Return 0 if permission is granted. - * @sb_mnt_opts_compat: - * Determine if the new mount options in @mnt_opts are allowed given - * the existing mounted filesystem at @sb. - * @sb superblock being compared. - * @mnt_opts new mount options. - * Return 0 if options are compatible. - * @sb_remount: - * Extracts security system specific mount options and verifies no changes - * are being made to those options. - * @sb superblock being remounted. - * @data contains the filesystem-specific data. - * Return 0 if permission is granted. - * @sb_kern_mount: - * Mount this @sb if allowed by permissions. - * Return 0 if permission is granted. - * @sb_show_options: - * Show (print on @m) mount options for this @sb. - * Return 0 on success, negative values on failure. - * @sb_umount: - * Check permission before the @mnt file system is unmounted. - * @mnt contains the mounted file system. - * @flags contains the unmount flags, e.g. MNT_FORCE. - * Return 0 if permission is granted. - * @sb_pivotroot: - * Check permission before pivoting the root filesystem. - * @old_path contains the path for the new location of the - * current root (put_old). - * @new_path contains the path for the new root (new_root). - * Return 0 if permission is granted. - * @sb_set_mnt_opts: - * Set the security relevant mount options used for a superblock - * @sb the superblock to set security mount options for. - * @opts binary data structure containing all lsm mount data. - * Return 0 on success, error on failure. - * @sb_clone_mnt_opts: - * Copy all security options from a given superblock to another - * @oldsb old superblock which contain information to clone. - * @newsb new superblock which needs filled in. - * Return 0 on success, error on failure. - * @move_mount: - * Check permission before a mount is moved. - * @from_path indicates the mount that is going to be moved. - * @to_path indicates the mountpoint that will be mounted upon. - * Return 0 if permission is granted. - * @dentry_init_security: - * Compute a context for a dentry as the inode is not yet available - * since NFSv4 has no label backed by an EA anyway. - * @dentry dentry to use in calculating the context. - * @mode mode used to determine resource type. - * @name name of the last path component used to create file. - * @xattr_name pointer to place the pointer to security xattr name. - * Caller does not have to free the resulting pointer. Its - * a pointer to static string. - * @ctx pointer to place the pointer to the resulting context in. - * @ctxlen point to place the length of the resulting context. - * Return 0 on success, negative values on failure. - * @dentry_create_files_as: - * Compute a context for a dentry as the inode is not yet available - * and set that context in passed in creds so that new files are - * created using that context. Context is calculated using the - * passed in creds and not the creds of the caller. - * @dentry dentry to use in calculating the context. - * @mode mode used to determine resource type. - * @name name of the last path component used to create file. - * @old creds which should be used for context calculation. - * @new creds to modify. - * Return 0 on success, error on failure. - * - * - * Security hooks for inode operations. - * - * @inode_alloc_security: - * Allocate and attach a security structure to @inode->i_security. The - * i_security field is initialized to NULL when the inode structure is - * allocated. - * @inode contains the inode structure. - * Return 0 if operation was successful. - * @inode_free_security: - * @inode contains the inode structure. - * Deallocate the inode security structure and set @inode->i_security to - * NULL. - * @inode_init_security: - * Obtain the security attribute name suffix and value to set on a newly - * created inode and set up the incore security field for the new inode. - * This hook is called by the fs code as part of the inode creation - * transaction and provides for atomic labeling of the inode, unlike - * the post_create/mkdir/... hooks called by the VFS. The hook function - * is expected to allocate the name and value via kmalloc, with the caller - * being responsible for calling kfree after using them. - * If the security module does not use security attributes or does - * not wish to put a security attribute on this particular inode, - * then it should return -EOPNOTSUPP to skip this processing. - * @inode contains the inode structure of the newly created inode. - * @dir contains the inode structure of the parent directory. - * @qstr contains the last path component of the new object. - * @name will be set to the allocated name suffix (e.g. selinux). - * @value will be set to the allocated attribute value. - * @len will be set to the length of the value. - * Returns 0 if @name and @value have been successfully set, - * -EOPNOTSUPP if no security attribute is needed, or - * -ENOMEM on memory allocation failure. - * @inode_init_security_anon: - * Set up the incore security field for the new anonymous inode - * and return whether the inode creation is permitted by the security - * module or not. - * @inode contains the inode structure. - * @name name of the anonymous inode class. - * @context_inode optional related inode. - * Returns 0 on success, -EACCES if the security module denies the - * creation of this inode, or another -errno upon other errors. - * @inode_create: - * Check permission to create a regular file. - * @dir contains inode structure of the parent of the new file. - * @dentry contains the dentry structure for the file to be created. - * @mode contains the file mode of the file to be created. - * Return 0 if permission is granted. - * @inode_link: - * Check permission before creating a new hard link to a file. - * @old_dentry contains the dentry structure for an existing - * link to the file. - * @dir contains the inode structure of the parent directory - * of the new link. - * @new_dentry contains the dentry structure for the new link. - * Return 0 if permission is granted. - * @path_link: - * Check permission before creating a new hard link to a file. - * @old_dentry contains the dentry structure for an existing link - * to the file. - * @new_dir contains the path structure of the parent directory of - * the new link. - * @new_dentry contains the dentry structure for the new link. - * Return 0 if permission is granted. - * @inode_unlink: - * Check the permission to remove a hard link to a file. - * @dir contains the inode structure of parent directory of the file. - * @dentry contains the dentry structure for file to be unlinked. - * Return 0 if permission is granted. - * @path_unlink: - * Check the permission to remove a hard link to a file. - * @dir contains the path structure of parent directory of the file. - * @dentry contains the dentry structure for file to be unlinked. - * Return 0 if permission is granted. - * @inode_symlink: - * Check the permission to create a symbolic link to a file. - * @dir contains the inode structure of parent directory of - * the symbolic link. - * @dentry contains the dentry structure of the symbolic link. - * @old_name contains the pathname of file. - * Return 0 if permission is granted. - * @path_symlink: - * Check the permission to create a symbolic link to a file. - * @dir contains the path structure of parent directory of - * the symbolic link. - * @dentry contains the dentry structure of the symbolic link. - * @old_name contains the pathname of file. - * Return 0 if permission is granted. - * @inode_mkdir: - * Check permissions to create a new directory in the existing directory - * associated with inode structure @dir. - * @dir contains the inode structure of parent of the directory - * to be created. - * @dentry contains the dentry structure of new directory. - * @mode contains the mode of new directory. - * Return 0 if permission is granted. - * @path_mkdir: - * Check permissions to create a new directory in the existing directory - * associated with path structure @path. - * @dir contains the path structure of parent of the directory - * to be created. - * @dentry contains the dentry structure of new directory. - * @mode contains the mode of new directory. - * Return 0 if permission is granted. - * @inode_rmdir: - * Check the permission to remove a directory. - * @dir contains the inode structure of parent of the directory - * to be removed. - * @dentry contains the dentry structure of directory to be removed. - * Return 0 if permission is granted. - * @path_rmdir: - * Check the permission to remove a directory. - * @dir contains the path structure of parent of the directory to be - * removed. - * @dentry contains the dentry structure of directory to be removed. - * Return 0 if permission is granted. - * @inode_mknod: - * Check permissions when creating a special file (or a socket or a fifo - * file created via the mknod system call). Note that if mknod operation - * is being done for a regular file, then the create hook will be called - * and not this hook. - * @dir contains the inode structure of parent of the new file. - * @dentry contains the dentry structure of the new file. - * @mode contains the mode of the new file. - * @dev contains the device number. - * Return 0 if permission is granted. - * @path_mknod: - * Check permissions when creating a file. Note that this hook is called - * even if mknod operation is being done for a regular file. - * @dir contains the path structure of parent of the new file. - * @dentry contains the dentry structure of the new file. - * @mode contains the mode of the new file. - * @dev contains the undecoded device number. Use new_decode_dev() to get - * the decoded device number. - * Return 0 if permission is granted. - * @inode_rename: - * Check for permission to rename a file or directory. - * @old_dir contains the inode structure for parent of the old link. - * @old_dentry contains the dentry structure of the old link. - * @new_dir contains the inode structure for parent of the new link. - * @new_dentry contains the dentry structure of the new link. - * Return 0 if permission is granted. - * @path_rename: - * Check for permission to rename a file or directory. - * @old_dir contains the path structure for parent of the old link. - * @old_dentry contains the dentry structure of the old link. - * @new_dir contains the path structure for parent of the new link. - * @new_dentry contains the dentry structure of the new link. - * @flags may contain rename options such as RENAME_EXCHANGE. - * Return 0 if permission is granted. - * @path_chmod: - * Check for permission to change a mode of the file @path. The new - * mode is specified in @mode. - * @path contains the path structure of the file to change the mode. - * @mode contains the new DAC's permission, which is a bitmask of - * constants from <include/uapi/linux/stat.h>. - * Return 0 if permission is granted. - * @path_chown: - * Check for permission to change owner/group of a file or directory. - * @path contains the path structure. - * @uid contains new owner's ID. - * @gid contains new group's ID. - * Return 0 if permission is granted. - * @path_chroot: - * Check for permission to change root directory. - * @path contains the path structure. - * Return 0 if permission is granted. - * @path_notify: - * Check permissions before setting a watch on events as defined by @mask, - * on an object at @path, whose type is defined by @obj_type. - * Return 0 if permission is granted. - * @inode_readlink: - * Check the permission to read the symbolic link. - * @dentry contains the dentry structure for the file link. - * Return 0 if permission is granted. - * @inode_follow_link: - * Check permission to follow a symbolic link when looking up a pathname. - * @dentry contains the dentry structure for the link. - * @inode contains the inode, which itself is not stable in RCU-walk. - * @rcu indicates whether we are in RCU-walk mode. - * Return 0 if permission is granted. - * @inode_permission: - * Check permission before accessing an inode. This hook is called by the - * existing Linux permission function, so a security module can use it to - * provide additional checking for existing Linux permission checks. - * Notice that this hook is called when a file is opened (as well as many - * other operations), whereas the file_security_ops permission hook is - * called when the actual read/write operations are performed. - * @inode contains the inode structure to check. - * @mask contains the permission mask. - * Return 0 if permission is granted. - * @inode_setattr: - * Check permission before setting file attributes. Note that the kernel - * call to notify_change is performed from several locations, whenever - * file attributes change (such as when a file is truncated, chown/chmod - * operations, transferring disk quotas, etc). - * @dentry contains the dentry structure for the file. - * @attr is the iattr structure containing the new file attributes. - * Return 0 if permission is granted. - * @path_truncate: - * Check permission before truncating the file indicated by path. - * Note that truncation permissions may also be checked based on - * already opened files, using the @file_truncate hook. - * @path contains the path structure for the file. - * Return 0 if permission is granted. - * @inode_getattr: - * Check permission before obtaining file attributes. - * @path contains the path structure for the file. - * Return 0 if permission is granted. - * @inode_setxattr: - * Check permission before setting the extended attributes - * @value identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_post_setxattr: - * Update inode security field after successful setxattr operation. - * @value identified by @name for @dentry. - * @inode_getxattr: - * Check permission before obtaining the extended attributes - * identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_listxattr: - * Check permission before obtaining the list of extended attribute - * names for @dentry. - * Return 0 if permission is granted. - * @inode_removexattr: - * Check permission before removing the extended attribute - * identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_set_acl: - * Check permission before setting posix acls - * The posix acls in @kacl are identified by @acl_name. - * Return 0 if permission is granted. - * @inode_get_acl: - * Check permission before getting osix acls - * The posix acls are identified by @acl_name. - * Return 0 if permission is granted. - * @inode_remove_acl: - * Check permission before removing posix acls - * The posix acls are identified by @acl_name. - * Return 0 if permission is granted. - * @inode_getsecurity: - * Retrieve a copy of the extended attribute representation of the - * security label associated with @name for @inode via @buffer. Note that - * @name is the remainder of the attribute name after the security prefix - * has been removed. @alloc is used to specify if the call should return a - * value via the buffer or just the value length. - * Return size of buffer on success. - * @inode_setsecurity: - * Set the security label associated with @name for @inode from the - * extended attribute value @value. @size indicates the size of the - * @value in bytes. @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. - * Note that @name is the remainder of the attribute name after the - * security. prefix has been removed. - * Return 0 on success. - * @inode_listsecurity: - * Copy the extended attribute names for the security labels - * associated with @inode into @buffer. The maximum size of @buffer - * is specified by @buffer_size. @buffer may be NULL to request - * the size of the buffer required. - * Returns number of bytes used/required on success. - * @inode_need_killpriv: - * Called when an inode has been changed. - * @dentry is the dentry being changed. - * Return <0 on error to abort the inode change operation. - * Return 0 if inode_killpriv does not need to be called. - * Return >0 if inode_killpriv does need to be called. - * @inode_killpriv: - * The setuid bit is being removed. Remove similar security labels. - * Called with the dentry->d_inode->i_mutex held. - * @idmap: idmap of the mount. - * @dentry is the dentry being changed. - * Return 0 on success. If error is returned, then the operation - * causing setuid bit removal is failed. - * @inode_getsecid: - * Get the secid associated with the node. - * @inode contains a pointer to the inode. - * @secid contains a pointer to the location where result will be saved. - * In case of failure, @secid will be set to zero. - * @inode_copy_up: - * A file is about to be copied up from lower layer to upper layer of - * overlay filesystem. Security module can prepare a set of new creds - * and modify as need be and return new creds. Caller will switch to - * new creds temporarily to create new file and release newly allocated - * creds. - * @src indicates the union dentry of file that is being copied up. - * @new pointer to pointer to return newly allocated creds. - * Returns 0 on success or a negative error code on error. - * @inode_copy_up_xattr: - * Filter the xattrs being copied up when a unioned file is copied - * up from a lower layer to the union/overlay layer. - * @name indicates the name of the xattr. - * Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP if - * security module does not know about attribute or a negative error code - * to abort the copy up. Note that the caller is responsible for reading - * and writing the xattrs as this hook is merely a filter. - * @d_instantiate: - * Fill in @inode security information for a @dentry if allowed. - * @getprocattr: - * Read attribute @name for process @p and store it into @value if allowed. - * Return the length of @value on success, a negative value otherwise. - * @setprocattr: - * Write (set) attribute @name to @value, size @size if allowed. - * Return written bytes on success, a negative value otherwise. - * - * Security hooks for kernfs node operations - * - * @kernfs_init_security: - * Initialize the security context of a newly created kernfs node based - * on its own and its parent's attributes. - * @kn_dir the parent kernfs node. - * @kn the new child kernfs node. - * Return 0 if permission is granted. - * - * Security hooks for file operations - * - * @file_permission: - * Check file permissions before accessing an open file. This hook is - * called by various operations that read or write files. A security - * module can use this hook to perform additional checking on these - * operations, e.g. to revalidate permissions on use to support privilege - * bracketing or policy changes. Notice that this hook is used when the - * actual read/write operations are performed, whereas the - * inode_security_ops hook is called when a file is opened (as well as - * many other operations). - * Caveat: Although this hook can be used to revalidate permissions for - * various system call operations that read or write files, it does not - * address the revalidation of permissions for memory-mapped files. - * Security modules must handle this separately if they need such - * revalidation. - * @file contains the file structure being accessed. - * @mask contains the requested permissions. - * Return 0 if permission is granted. - * @file_alloc_security: - * Allocate and attach a security structure to the file->f_security field. - * The security field is initialized to NULL when the structure is first - * created. - * @file contains the file structure to secure. - * Return 0 if the hook is successful and permission is granted. - * @file_free_security: - * Deallocate and free any security structures stored in file->f_security. - * @file contains the file structure being modified. - * @file_ioctl: - * @file contains the file structure. - * @cmd contains the operation to perform. - * @arg contains the operational arguments. - * Check permission for an ioctl operation on @file. Note that @arg - * sometimes represents a user space pointer; in other cases, it may be a - * simple integer value. When @arg represents a user space pointer, it - * should never be used by the security module. - * Return 0 if permission is granted. - * @mmap_addr: - * Check permissions for a mmap operation at @addr. - * @addr contains virtual address that will be used for the operation. - * Return 0 if permission is granted. - * @mmap_file: - * Check permissions for a mmap operation. The @file may be NULL, e.g. - * if mapping anonymous memory. - * @file contains the file structure for file to map (may be NULL). - * @reqprot contains the protection requested by the application. - * @prot contains the protection that will be applied by the kernel. - * @flags contains the operational flags. - * Return 0 if permission is granted. - * @file_mprotect: - * Check permissions before changing memory access permissions. - * @vma contains the memory region to modify. - * @reqprot contains the protection requested by the application. - * @prot contains the protection that will be applied by the kernel. - * Return 0 if permission is granted. - * @file_lock: - * Check permission before performing file locking operations. - * Note the hook mediates both flock and fcntl style locks. - * @file contains the file structure. - * @cmd contains the posix-translated lock operation to perform - * (e.g. F_RDLCK, F_WRLCK). - * Return 0 if permission is granted. - * @file_fcntl: - * Check permission before allowing the file operation specified by @cmd - * from being performed on the file @file. Note that @arg sometimes - * represents a user space pointer; in other cases, it may be a simple - * integer value. When @arg represents a user space pointer, it should - * never be used by the security module. - * @file contains the file structure. - * @cmd contains the operation to be performed. - * @arg contains the operational arguments. - * Return 0 if permission is granted. - * @file_set_fowner: - * Save owner security information (typically from current->security) in - * file->f_security for later use by the send_sigiotask hook. - * @file contains the file structure to update. - * Return 0 on success. - * @file_send_sigiotask: - * Check permission for the file owner @fown to send SIGIO or SIGURG to the - * process @tsk. Note that this hook is sometimes called from interrupt. - * Note that the fown_struct, @fown, is never outside the context of a - * struct file, so the file structure (and associated security information) - * can always be obtained: container_of(fown, struct file, f_owner) - * @tsk contains the structure of task receiving signal. - * @fown contains the file owner information. - * @sig is the signal that will be sent. When 0, kernel sends SIGIO. - * Return 0 if permission is granted. - * @file_receive: - * This hook allows security modules to control the ability of a process - * to receive an open file descriptor via socket IPC. - * @file contains the file structure being received. - * Return 0 if permission is granted. - * @file_truncate: - * Check permission before truncating a file, i.e. using ftruncate. - * Note that truncation permission may also be checked based on the path, - * using the @path_truncate hook. - * @file contains the file structure for the file. - * Return 0 if permission is granted. - * @file_open: - * Save open-time permission checking state for later use upon - * file_permission, and recheck access if anything has changed - * since inode_permission. - * Return 0 if permission is granted. - * - * Security hooks for task operations. - * - * @task_alloc: - * @task task being allocated. - * @clone_flags contains the flags indicating what should be shared. - * Handle allocation of task-related resources. - * Returns a zero on success, negative values on failure. - * @task_free: - * @task task about to be freed. - * Handle release of task-related resources. (Note that this can be called - * from interrupt context.) - * @cred_alloc_blank: - * @cred points to the credentials. - * @gfp indicates the atomicity of any memory allocations. - * Only allocate sufficient memory and attach to @cred such that - * cred_transfer() will not get ENOMEM. - * Return 0 on success, negative values on failure. - * @cred_free: - * @cred points to the credentials. - * Deallocate and clear the cred->security field in a set of credentials. - * @cred_prepare: - * @new points to the new credentials. - * @old points to the original credentials. - * @gfp indicates the atomicity of any memory allocations. - * Prepare a new set of credentials by copying the data from the old set. - * Return 0 on success, negative values on failure. - * @cred_transfer: - * @new points to the new credentials. - * @old points to the original credentials. - * Transfer data from original creds to new creds - * @cred_getsecid: - * Retrieve the security identifier of the cred structure @c - * @c contains the credentials, secid will be placed into @secid. - * In case of failure, @secid will be set to zero. - * @kernel_act_as: - * Set the credentials for a kernel service to act as (subjective context). - * @new points to the credentials to be modified. - * @secid specifies the security ID to be set. - * The current task must be the one that nominated @secid. - * Return 0 if successful. - * @kernel_create_files_as: - * Set the file creation context in a set of credentials to be the same as - * the objective context of the specified inode. - * @new points to the credentials to be modified. - * @inode points to the inode to use as a reference. - * The current task must be the one that nominated @inode. - * Return 0 if successful. - * @kernel_module_request: - * Ability to trigger the kernel to automatically upcall to userspace for - * userspace to load a kernel module with the given name. - * @kmod_name name of the module requested by the kernel. - * Return 0 if successful. - * @kernel_load_data: - * Load data provided by userspace. - * @id kernel load data identifier. - * @contents if a subsequent @kernel_post_load_data will be called. - * Return 0 if permission is granted. - * @kernel_post_load_data: - * Load data provided by a non-file source (usually userspace buffer). - * @buf pointer to buffer containing the data contents. - * @size length of the data contents. - * @id kernel load data identifier. - * @description a text description of what was loaded, @id-specific. - * Return 0 if permission is granted. - * This must be paired with a prior @kernel_load_data call that had - * @contents set to true. - * @kernel_read_file: - * Read a file specified by userspace. - * @file contains the file structure pointing to the file being read - * by the kernel. - * @id kernel read file identifier. - * @contents if a subsequent @kernel_post_read_file will be called. - * Return 0 if permission is granted. - * @kernel_post_read_file: - * Read a file specified by userspace. - * @file contains the file structure pointing to the file being read - * by the kernel. - * @buf pointer to buffer containing the file contents. - * @size length of the file contents. - * @id kernel read file identifier. - * This must be paired with a prior @kernel_read_file call that had - * @contents set to true. - * Return 0 if permission is granted. - * @task_fix_setuid: - * Update the module's state after setting one or more of the user - * identity attributes of the current process. The @flags parameter - * indicates which of the set*uid system calls invoked this hook. If - * @new is the set of credentials that will be installed. Modifications - * should be made to this rather than to @current->cred. - * @old is the set of credentials that are being replaced. - * @flags contains one of the LSM_SETID_* values. - * Return 0 on success. - * @task_fix_setgid: - * Update the module's state after setting one or more of the group - * identity attributes of the current process. The @flags parameter - * indicates which of the set*gid system calls invoked this hook. - * @new is the set of credentials that will be installed. Modifications - * should be made to this rather than to @current->cred. - * @old is the set of credentials that are being replaced. - * @flags contains one of the LSM_SETID_* values. - * Return 0 on success. - * @task_fix_setgroups: - * Update the module's state after setting the supplementary group - * identity attributes of the current process. - * @new is the set of credentials that will be installed. Modifications - * should be made to this rather than to @current->cred. - * @old is the set of credentials that are being replaced. - * Return 0 on success. - * @task_setpgid: - * Check permission before setting the process group identifier of the - * process @p to @pgid. - * @p contains the task_struct for process being modified. - * @pgid contains the new pgid. - * Return 0 if permission is granted. - * @task_getpgid: - * Check permission before getting the process group identifier of the - * process @p. - * @p contains the task_struct for the process. - * Return 0 if permission is granted. - * @task_getsid: - * Check permission before getting the session identifier of the process - * @p. - * @p contains the task_struct for the process. - * Return 0 if permission is granted. - * @current_getsecid_subj: - * Retrieve the subjective security identifier of the current task and - * return it in @secid. - * In case of failure, @secid will be set to zero. - * @task_getsecid_obj: - * Retrieve the objective security identifier of the task_struct in @p - * and return it in @secid. - * In case of failure, @secid will be set to zero. - * - * @task_setnice: - * Check permission before setting the nice value of @p to @nice. - * @p contains the task_struct of process. - * @nice contains the new nice value. - * Return 0 if permission is granted. - * @task_setioprio: - * Check permission before setting the ioprio value of @p to @ioprio. - * @p contains the task_struct of process. - * @ioprio contains the new ioprio value. - * Return 0 if permission is granted. - * @task_getioprio: - * Check permission before getting the ioprio value of @p. - * @p contains the task_struct of process. - * Return 0 if permission is granted. - * @task_prlimit: - * Check permission before getting and/or setting the resource limits of - * another task. - * @cred points to the cred structure for the current task. - * @tcred points to the cred structure for the target task. - * @flags contains the LSM_PRLIMIT_* flag bits indicating whether the - * resource limits are being read, modified, or both. - * Return 0 if permission is granted. - * @task_setrlimit: - * Check permission before setting the resource limits of process @p - * for @resource to @new_rlim. The old resource limit values can - * be examined by dereferencing (p->signal->rlim + resource). - * @p points to the task_struct for the target task's group leader. - * @resource contains the resource whose limit is being set. - * @new_rlim contains the new limits for @resource. - * Return 0 if permission is granted. - * @task_setscheduler: - * Check permission before setting scheduling policy and/or parameters of - * process @p. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_getscheduler: - * Check permission before obtaining scheduling information for process - * @p. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_movememory: - * Check permission before moving memory owned by process @p. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_kill: - * Check permission before sending signal @sig to @p. @info can be NULL, - * the constant 1, or a pointer to a kernel_siginfo structure. If @info is 1 or - * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming - * from the kernel and should typically be permitted. - * SIGIO signals are handled separately by the send_sigiotask hook in - * file_security_ops. - * @p contains the task_struct for process. - * @info contains the signal information. - * @sig contains the signal value. - * @cred contains the cred of the process where the signal originated, or - * NULL if the current task is the originator. - * Return 0 if permission is granted. - * @task_prctl: - * Check permission before performing a process control operation on the - * current process. - * @option contains the operation. - * @arg2 contains a argument. - * @arg3 contains a argument. - * @arg4 contains a argument. - * @arg5 contains a argument. - * Return -ENOSYS if no-one wanted to handle this op, any other value to - * cause prctl() to return immediately with that value. - * @task_to_inode: - * Set the security attributes for an inode based on an associated task's - * security attributes, e.g. for /proc/pid inodes. - * @p contains the task_struct for the task. - * @inode contains the inode structure for the inode. - * @userns_create: - * Check permission prior to creating a new user namespace. - * @cred points to prepared creds. - * Return 0 if successful, otherwise < 0 error code. - * - * Security hooks for Netlink messaging. - * - * @netlink_send: - * Save security information for a netlink message so that permission - * checking can be performed when the message is processed. The security - * information can be saved using the eff_cap field of the - * netlink_skb_parms structure. Also may be used to provide fine - * grained control over message transmission. - * @sk associated sock of task sending the message. - * @skb contains the sk_buff structure for the netlink message. - * Return 0 if the information was successfully saved and message - * is allowed to be transmitted. - * - * Security hooks for Unix domain networking. - * - * @unix_stream_connect: - * Check permissions before establishing a Unix domain stream connection - * between @sock and @other. - * @sock contains the sock structure. - * @other contains the peer sock structure. - * @newsk contains the new sock structure. - * Return 0 if permission is granted. - * @unix_may_send: - * Check permissions before connecting or sending datagrams from @sock to - * @other. - * @sock contains the socket structure. - * @other contains the peer socket structure. - * Return 0 if permission is granted. - * - * The @unix_stream_connect and @unix_may_send hooks were necessary because - * Linux provides an alternative to the conventional file name space for Unix - * domain sockets. Whereas binding and connecting to sockets in the file name - * space is mediated by the typical file permissions (and caught by the mknod - * and permission hooks in inode_security_ops), binding and connecting to - * sockets in the abstract name space is completely unmediated. Sufficient - * control of Unix domain sockets in the abstract name space isn't possible - * using only the socket layer hooks, since we need to know the actual target - * socket, which is not looked up until we are inside the af_unix code. - * - * Security hooks for socket operations. - * - * @socket_create: - * Check permissions prior to creating a new socket. - * @family contains the requested protocol family. - * @type contains the requested communications type. - * @protocol contains the requested protocol. - * @kern set to 1 if a kernel socket. - * Return 0 if permission is granted. - * @socket_post_create: - * This hook allows a module to update or allocate a per-socket security - * structure. Note that the security field was not added directly to the - * socket structure, but rather, the socket security information is stored - * in the associated inode. Typically, the inode alloc_security hook will - * allocate and attach security information to - * SOCK_INODE(sock)->i_security. This hook may be used to update the - * SOCK_INODE(sock)->i_security field with additional information that - * wasn't available when the inode was allocated. - * @sock contains the newly created socket structure. - * @family contains the requested protocol family. - * @type contains the requested communications type. - * @protocol contains the requested protocol. - * @kern set to 1 if a kernel socket. - * Return 0 if permission is granted. - * @socket_socketpair: - * Check permissions before creating a fresh pair of sockets. - * @socka contains the first socket structure. - * @sockb contains the second socket structure. - * Return 0 if permission is granted and the connection was established. - * @socket_bind: - * Check permission before socket protocol layer bind operation is - * performed and the socket @sock is bound to the address specified in the - * @address parameter. - * @sock contains the socket structure. - * @address contains the address to bind to. - * @addrlen contains the length of address. - * Return 0 if permission is granted. - * @socket_connect: - * Check permission before socket protocol layer connect operation - * attempts to connect socket @sock to a remote address, @address. - * @sock contains the socket structure. - * @address contains the address of remote endpoint. - * @addrlen contains the length of address. - * Return 0 if permission is granted. - * @socket_listen: - * Check permission before socket protocol layer listen operation. - * @sock contains the socket structure. - * @backlog contains the maximum length for the pending connection queue. - * Return 0 if permission is granted. - * @socket_accept: - * Check permission before accepting a new connection. Note that the new - * socket, @newsock, has been created and some information copied to it, - * but the accept operation has not actually been performed. - * @sock contains the listening socket structure. - * @newsock contains the newly created server socket for connection. - * Return 0 if permission is granted. - * @socket_sendmsg: - * Check permission before transmitting a message to another socket. - * @sock contains the socket structure. - * @msg contains the message to be transmitted. - * @size contains the size of message. - * Return 0 if permission is granted. - * @socket_recvmsg: - * Check permission before receiving a message from a socket. - * @sock contains the socket structure. - * @msg contains the message structure. - * @size contains the size of message structure. - * @flags contains the operational flags. - * Return 0 if permission is granted. - * @socket_getsockname: - * Check permission before the local address (name) of the socket object - * @sock is retrieved. - * @sock contains the socket structure. - * Return 0 if permission is granted. - * @socket_getpeername: - * Check permission before the remote address (name) of a socket object - * @sock is retrieved. - * @sock contains the socket structure. - * Return 0 if permission is granted. - * @socket_getsockopt: - * Check permissions before retrieving the options associated with socket - * @sock. - * @sock contains the socket structure. - * @level contains the protocol level to retrieve option from. - * @optname contains the name of option to retrieve. - * Return 0 if permission is granted. - * @socket_setsockopt: - * Check permissions before setting the options associated with socket - * @sock. - * @sock contains the socket structure. - * @level contains the protocol level to set options for. - * @optname contains the name of the option to set. - * Return 0 if permission is granted. - * @socket_shutdown: - * Checks permission before all or part of a connection on the socket - * @sock is shut down. - * @sock contains the socket structure. - * @how contains the flag indicating how future sends and receives - * are handled. - * Return 0 if permission is granted. - * @socket_sock_rcv_skb: - * Check permissions on incoming network packets. This hook is distinct - * from Netfilter's IP input hooks since it is the first time that the - * incoming sk_buff @skb has been associated with a particular socket, @sk. - * Must not sleep inside this hook because some callers hold spinlocks. - * @sk contains the sock (not socket) associated with the incoming sk_buff. - * @skb contains the incoming network data. - * Return 0 if permission is granted. - * @socket_getpeersec_stream: - * This hook allows the security module to provide peer socket security - * state for unix or connected tcp sockets to userspace via getsockopt - * SO_GETPEERSEC. For tcp sockets this can be meaningful if the - * socket is associated with an ipsec SA. - * @sock is the local socket. - * @optval memory where the security state is to be copied. - * @optlen memory where the module should copy the actual length - * of the security state. - * @len as input is the maximum length to copy to userspace provided - * by the caller. - * Return 0 if all is well, otherwise, typical getsockopt return - * values. - * @socket_getpeersec_dgram: - * This hook allows the security module to provide peer socket security - * state for udp sockets on a per-packet basis to userspace via - * getsockopt SO_GETPEERSEC. The application must first have indicated - * the IP_PASSSEC option via getsockopt. It can then retrieve the - * security state returned by this hook for a packet via the SCM_SECURITY - * ancillary message type. - * @sock contains the peer socket. May be NULL. - * @skb is the sk_buff for the packet being queried. May be NULL. - * @secid pointer to store the secid of the packet. - * Return 0 on success, error on failure. - * @sk_alloc_security: - * Allocate and attach a security structure to the sk->sk_security field, - * which is used to copy security attributes between local stream sockets. - * Return 0 on success, error on failure. - * @sk_free_security: - * Deallocate security structure. - * @sk_clone_security: - * Clone/copy security structure. - * @sk_getsecid: - * Retrieve the LSM-specific secid for the sock to enable caching - * of network authorizations. - * @sock_graft: - * Sets the socket's isec sid to the sock's sid. - * @inet_conn_request: - * Sets the openreq's sid to socket's sid with MLS portion taken - * from peer sid. - * Return 0 if permission is granted. - * @inet_csk_clone: - * Sets the new child socket's sid to the openreq sid. - * @inet_conn_established: - * Sets the connection's peersid to the secmark on skb. - * @secmark_relabel_packet: - * Check if the process should be allowed to relabel packets to - * the given secid. - * Return 0 if permission is granted. - * @secmark_refcount_inc: - * Tells the LSM to increment the number of secmark labeling rules loaded. - * @secmark_refcount_dec: - * Tells the LSM to decrement the number of secmark labeling rules loaded. - * @req_classify_flow: - * Sets the flow's sid to the openreq sid. - * @tun_dev_alloc_security: - * This hook allows a module to allocate a security structure for a TUN - * device. - * @security pointer to a security structure pointer. - * Returns a zero on success, negative values on failure. - * @tun_dev_free_security: - * This hook allows a module to free the security structure for a TUN - * device. - * @security pointer to the TUN device's security structure. - * @tun_dev_create: - * Check permissions prior to creating a new TUN device. - * Return 0 if permission is granted. - * @tun_dev_attach_queue: - * Check permissions prior to attaching to a TUN device queue. - * @security pointer to the TUN device's security structure. - * Return 0 if permission is granted. - * @tun_dev_attach: - * This hook can be used by the module to update any security state - * associated with the TUN device's sock structure. - * @sk contains the existing sock structure. - * @security pointer to the TUN device's security structure. - * Return 0 if permission is granted. - * @tun_dev_open: - * This hook can be used by the module to update any security state - * associated with the TUN device's security structure. - * @security pointer to the TUN devices's security structure. - * Return 0 if permission is granted. - * - * Security hooks for SCTP - * - * @sctp_assoc_request: - * Passes the @asoc and @chunk->skb of the association INIT packet to - * the security module. - * @asoc pointer to sctp association structure. - * @skb pointer to skbuff of association packet. - * Return 0 on success, error on failure. - * @sctp_bind_connect: - * Validiate permissions required for each address associated with sock - * @sk. Depending on @optname, the addresses will be treated as either - * for a connect or bind service. The @addrlen is calculated on each - * ipv4 and ipv6 address using sizeof(struct sockaddr_in) or - * sizeof(struct sockaddr_in6). - * @sk pointer to sock structure. - * @optname name of the option to validate. - * @address list containing one or more ipv4/ipv6 addresses. - * @addrlen total length of address(s). - * Return 0 on success, error on failure. - * @sctp_sk_clone: - * Called whenever a new socket is created by accept(2) (i.e. a TCP - * style socket) or when a socket is 'peeled off' e.g userspace - * calls sctp_peeloff(3). - * @asoc pointer to current sctp association structure. - * @sk pointer to current sock structure. - * @newsk pointer to new sock structure. - * @sctp_assoc_established: - * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet - * to the security module. - * @asoc pointer to sctp association structure. - * @skb pointer to skbuff of association packet. - * Return 0 if permission is granted. - * - * Security hooks for Infiniband - * - * @ib_pkey_access: - * Check permission to access a pkey when modifing a QP. - * @subnet_prefix the subnet prefix of the port being used. - * @pkey the pkey to be accessed. - * @sec pointer to a security structure. - * Return 0 if permission is granted. - * @ib_endport_manage_subnet: - * Check permissions to send and receive SMPs on a end port. - * @dev_name the IB device name (i.e. mlx4_0). - * @port_num the port number. - * @sec pointer to a security structure. - * Return 0 if permission is granted. - * @ib_alloc_security: - * Allocate a security structure for Infiniband objects. - * @sec pointer to a security structure pointer. - * Returns 0 on success, non-zero on failure. - * @ib_free_security: - * Deallocate an Infiniband security structure. - * @sec contains the security structure to be freed. - * - * Security hooks for XFRM operations. - * - * @xfrm_policy_alloc_security: - * @ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy - * Database used by the XFRM system. - * @sec_ctx contains the security context information being provided by - * the user-level policy update program (e.g., setkey). - * @gfp is to specify the context for the allocation. - * Allocate a security structure to the xp->security field; the security - * field is initialized to NULL when the xfrm_policy is allocated. - * Return 0 if operation was successful (memory to allocate, legal - * context). - * @xfrm_policy_clone_security: - * @old_ctx contains an existing xfrm_sec_ctx. - * @new_ctxp contains a new xfrm_sec_ctx being cloned from old. - * Allocate a security structure in new_ctxp that contains the - * information from the old_ctx structure. - * Return 0 if operation was successful (memory to allocate). - * @xfrm_policy_free_security: - * @ctx contains the xfrm_sec_ctx. - * Deallocate xp->security. - * @xfrm_policy_delete_security: - * @ctx contains the xfrm_sec_ctx. - * Authorize deletion of xp->security. - * Return 0 if permission is granted. - * @xfrm_state_alloc: - * @x contains the xfrm_state being added to the Security Association - * Database by the XFRM system. - * @sec_ctx contains the security context information being provided by - * the user-level SA generation program (e.g., setkey or racoon). - * Allocate a security structure to the x->security field; the security - * field is initialized to NULL when the xfrm_state is allocated. Set the - * context to correspond to sec_ctx. Return 0 if operation was successful - * (memory to allocate, legal context). - * @xfrm_state_alloc_acquire: - * @x contains the xfrm_state being added to the Security Association - * Database by the XFRM system. - * @polsec contains the policy's security context. - * @secid contains the secid from which to take the mls portion of the - * context. - * Allocate a security structure to the x->security field; the security - * field is initialized to NULL when the xfrm_state is allocated. Set the - * context to correspond to secid. Return 0 if operation was successful - * (memory to allocate, legal context). - * @xfrm_state_free_security: - * @x contains the xfrm_state. - * Deallocate x->security. - * @xfrm_state_delete_security: - * @x contains the xfrm_state. - * Authorize deletion of x->security. - * Return 0 if permission is granted. - * @xfrm_policy_lookup: - * @ctx contains the xfrm_sec_ctx for which the access control is being - * checked. - * @fl_secid contains the flow security label that is used to authorize - * access to the policy xp. - * @dir contains the direction of the flow (input or output). - * Check permission when a flow selects a xfrm_policy for processing - * XFRMs on a packet. The hook is called when selecting either a - * per-socket policy or a generic xfrm policy. - * Return 0 if permission is granted, -ESRCH otherwise, or -errno - * on other errors. - * @xfrm_state_pol_flow_match: - * @x contains the state to match. - * @xp contains the policy to check for a match. - * @flic contains the flowi_common struct to check for a match. - * Return 1 if there is a match. - * @xfrm_decode_session: - * @skb points to skb to decode. - * @secid points to the flow key secid to set. - * @ckall says if all xfrms used should be checked for same secid. - * Return 0 if ckall is zero or all xfrms used have the same secid. - * - * Security hooks affecting all Key Management operations - * - * @key_alloc: - * Permit allocation of a key and assign security data. Note that key does - * not have a serial number assigned at this point. - * @key points to the key. - * @flags is the allocation flags. - * Return 0 if permission is granted, -ve error otherwise. - * @key_free: - * Notification of destruction; free security data. - * @key points to the key. - * No return value. - * @key_permission: - * See whether a specific operational right is granted to a process on a - * key. - * @key_ref refers to the key (key pointer + possession attribute bit). - * @cred points to the credentials to provide the context against which to - * evaluate the security data on the key. - * @perm describes the combination of permissions required of this key. - * Return 0 if permission is granted, -ve error otherwise. - * @key_getsecurity: - * Get a textual representation of the security context attached to a key - * for the purposes of honouring KEYCTL_GETSECURITY. This function - * allocates the storage for the NUL-terminated string and the caller - * should free it. - * @key points to the key to be queried. - * @_buffer points to a pointer that should be set to point to the - * resulting string (if no label or an error occurs). - * Return the length of the string (including terminating NUL) or -ve if - * an error. - * May also return 0 (and a NULL buffer pointer) if there is no label. - * - * Security hooks affecting all System V IPC operations. - * - * @ipc_permission: - * Check permissions for access to IPC - * @ipcp contains the kernel IPC permission structure. - * @flag contains the desired (requested) permission set. - * Return 0 if permission is granted. - * @ipc_getsecid: - * Get the secid associated with the ipc object. - * @ipcp contains the kernel IPC permission structure. - * @secid contains a pointer to the location where result will be saved. - * In case of failure, @secid will be set to zero. - * - * Security hooks for individual messages held in System V IPC message queues - * - * @msg_msg_alloc_security: - * Allocate and attach a security structure to the msg->security field. - * The security field is initialized to NULL when the structure is first - * created. - * @msg contains the message structure to be modified. - * Return 0 if operation was successful and permission is granted. - * @msg_msg_free_security: - * Deallocate the security structure for this message. - * @msg contains the message structure to be modified. - * - * Security hooks for System V IPC Message Queues - * - * @msg_queue_alloc_security: - * Allocate and attach a security structure to the - * @perm->security field. The security field is initialized to - * NULL when the structure is first created. - * @perm contains the IPC permissions of the message queue. - * Return 0 if operation was successful and permission is granted. - * @msg_queue_free_security: - * Deallocate security field @perm->security for the message queue. - * @perm contains the IPC permissions of the message queue. - * @msg_queue_associate: - * Check permission when a message queue is requested through the - * msgget system call. This hook is only called when returning the - * message queue identifier for an existing message queue, not when a - * new message queue is created. - * @perm contains the IPC permissions of the message queue. - * @msqflg contains the operation control flags. - * Return 0 if permission is granted. - * @msg_queue_msgctl: - * Check permission when a message control operation specified by @cmd - * is to be performed on the message queue with permissions @perm. - * The @perm may be NULL, e.g. for IPC_INFO or MSG_INFO. - * @perm contains the IPC permissions of the msg queue. May be NULL. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @msg_queue_msgsnd: - * Check permission before a message, @msg, is enqueued on the message - * queue with permissions @perm. - * @perm contains the IPC permissions of the message queue. - * @msg contains the message to be enqueued. - * @msqflg contains operational flags. - * Return 0 if permission is granted. - * @msg_queue_msgrcv: - * Check permission before a message, @msg, is removed from the message - * queue. The @target task structure contains a pointer to the - * process that will be receiving the message (not equal to the current - * process when inline receives are being performed). - * @perm contains the IPC permissions of the message queue. - * @msg contains the message destination. - * @target contains the task structure for recipient process. - * @type contains the type of message requested. - * @mode contains the operational flags. - * Return 0 if permission is granted. - * - * Security hooks for System V Shared Memory Segments - * - * @shm_alloc_security: - * Allocate and attach a security structure to the @perm->security - * field. The security field is initialized to NULL when the structure is - * first created. - * @perm contains the IPC permissions of the shared memory structure. - * Return 0 if operation was successful and permission is granted. - * @shm_free_security: - * Deallocate the security structure @perm->security for the memory segment. - * @perm contains the IPC permissions of the shared memory structure. - * @shm_associate: - * Check permission when a shared memory region is requested through the - * shmget system call. This hook is only called when returning the shared - * memory region identifier for an existing region, not when a new shared - * memory region is created. - * @perm contains the IPC permissions of the shared memory structure. - * @shmflg contains the operation control flags. - * Return 0 if permission is granted. - * @shm_shmctl: - * Check permission when a shared memory control operation specified by - * @cmd is to be performed on the shared memory region with permissions @perm. - * The @perm may be NULL, e.g. for IPC_INFO or SHM_INFO. - * @perm contains the IPC permissions of the shared memory structure. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @shm_shmat: - * Check permissions prior to allowing the shmat system call to attach the - * shared memory segment with permissions @perm to the data segment of the - * calling process. The attaching address is specified by @shmaddr. - * @perm contains the IPC permissions of the shared memory structure. - * @shmaddr contains the address to attach memory region to. - * @shmflg contains the operational flags. - * Return 0 if permission is granted. - * - * Security hooks for System V Semaphores - * - * @sem_alloc_security: - * Allocate and attach a security structure to the @perm->security - * field. The security field is initialized to NULL when the structure is - * first created. - * @perm contains the IPC permissions of the semaphore. - * Return 0 if operation was successful and permission is granted. - * @sem_free_security: - * Deallocate security structure @perm->security for the semaphore. - * @perm contains the IPC permissions of the semaphore. - * @sem_associate: - * Check permission when a semaphore is requested through the semget - * system call. This hook is only called when returning the semaphore - * identifier for an existing semaphore, not when a new one must be - * created. - * @perm contains the IPC permissions of the semaphore. - * @semflg contains the operation control flags. - * Return 0 if permission is granted. - * @sem_semctl: - * Check permission when a semaphore operation specified by @cmd is to be - * performed on the semaphore. The @perm may be NULL, e.g. for - * IPC_INFO or SEM_INFO. - * @perm contains the IPC permissions of the semaphore. May be NULL. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @sem_semop: - * Check permissions before performing operations on members of the - * semaphore set. If the @alter flag is nonzero, the semaphore set - * may be modified. - * @perm contains the IPC permissions of the semaphore. - * @sops contains the operations to perform. - * @nsops contains the number of operations to perform. - * @alter contains the flag indicating whether changes are to be made. - * Return 0 if permission is granted. - * - * @binder_set_context_mgr: - * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the struct cred for the current binder process. - * Return 0 if permission is granted. - * @binder_transaction: - * Check whether @from is allowed to invoke a binder transaction call - * to @to. - * @from contains the struct cred for the sending process. - * @to contains the struct cred for the receiving process. - * Return 0 if permission is granted. - * @binder_transfer_binder: - * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the struct cred for the sending process. - * @to contains the struct cred for the receiving process. - * Return 0 if permission is granted. - * @binder_transfer_file: - * Check whether @from is allowed to transfer @file to @to. - * @from contains the struct cred for the sending process. - * @file contains the struct file being transferred. - * @to contains the struct cred for the receiving process. - * Return 0 if permission is granted. - * - * @ptrace_access_check: - * Check permission before allowing the current process to trace the - * @child process. - * Security modules may also want to perform a process tracing check - * during an execve in the set_security or apply_creds hooks of - * tracing check during an execve in the bprm_set_creds hook of - * binprm_security_ops if the process is being traced and its security - * attributes would be changed by the execve. - * @child contains the task_struct structure for the target process. - * @mode contains the PTRACE_MODE flags indicating the form of access. - * Return 0 if permission is granted. - * @ptrace_traceme: - * Check that the @parent process has sufficient permission to trace the - * current process before allowing the current process to present itself - * to the @parent process for tracing. - * @parent contains the task_struct structure for debugger process. - * Return 0 if permission is granted. - * @capget: - * Get the @effective, @inheritable, and @permitted capability sets for - * the @target process. The hook may also perform permission checking to - * determine if the current process is allowed to see the capability sets - * of the @target process. - * @target contains the task_struct structure for target process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 if the capability sets were successfully obtained. - * @capset: - * Set the @effective, @inheritable, and @permitted capability sets for - * the current process. - * @new contains the new credentials structure for target process. - * @old contains the current credentials structure for target process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 and update @new if permission is granted. - * @capable: - * Check whether the @tsk process has the @cap capability in the indicated - * credentials. - * @cred contains the credentials to use. - * @ns contains the user namespace we want the capability in. - * @cap contains the capability <include/linux/capability.h>. - * @opts contains options for the capable check <include/linux/security.h>. - * Return 0 if the capability is granted for @tsk. - * @quotactl: - * Check whether the quotactl syscall is allowed for this @sb. - * Return 0 if permission is granted. - * @quota_on: - * Check whether QUOTAON is allowed for this @dentry. - * Return 0 if permission is granted. - * @syslog: - * Check permission before accessing the kernel message ring or changing - * logging to the console. - * See the syslog(2) manual page for an explanation of the @type values. - * @type contains the SYSLOG_ACTION_* constant from - * <include/linux/syslog.h>. - * Return 0 if permission is granted. - * @settime: - * Check permission to change the system time. - * struct timespec64 is defined in <include/linux/time64.h> and timezone - * is defined in <include/linux/time.h> - * @ts contains new time. - * @tz contains new timezone. - * Return 0 if permission is granted. - * @vm_enough_memory: - * Check permissions for allocating a new virtual mapping. - * @mm contains the mm struct it is being added to. - * @pages contains the number of pages. - * Return 0 if permission is granted by the LSM infrastructure to the - * caller. If all LSMs return a positive value, __vm_enough_memory() will - * be called with cap_sys_admin set. If at least one LSM returns 0 or - * negative, __vm_enough_memory() will be called with cap_sys_admin - * cleared. - * - * @ismaclabel: - * Check if the extended attribute specified by @name - * represents a MAC label. Returns 1 if name is a MAC - * attribute otherwise returns 0. - * @name full extended attribute name to check against - * LSM as a MAC label. - * - * @secid_to_secctx: - * Convert secid to security context. If secdata is NULL the length of - * the result will be returned in seclen, but no secdata will be returned. - * This does mean that the length could change between calls to check the - * length and the next call which actually allocates and returns the - * secdata. - * @secid contains the security ID. - * @secdata contains the pointer that stores the converted security - * context. - * @seclen pointer which contains the length of the data. - * Return 0 on success, error on failure. - * @secctx_to_secid: - * Convert security context to secid. - * @secid contains the pointer to the generated security ID. - * @secdata contains the security context. - * Return 0 on success, error on failure. - * - * @release_secctx: - * Release the security context. - * @secdata contains the security context. - * @seclen contains the length of the security context. - * - * Security hooks for Audit - * - * @audit_rule_init: - * Allocate and initialize an LSM audit rule structure. - * @field contains the required Audit action. - * Fields flags are defined in <include/linux/audit.h> - * @op contains the operator the rule uses. - * @rulestr contains the context where the rule will be applied to. - * @lsmrule contains a pointer to receive the result. - * Return 0 if @lsmrule has been successfully set, - * -EINVAL in case of an invalid rule. - * - * @audit_rule_known: - * Specifies whether given @krule contains any fields related to - * current LSM. - * @krule contains the audit rule of interest. - * Return 1 in case of relation found, 0 otherwise. - * - * @audit_rule_match: - * Determine if given @secid matches a rule previously approved - * by @audit_rule_known. - * @secid contains the security id in question. - * @field contains the field which relates to current LSM. - * @op contains the operator that will be used for matching. - * @lrule points to the audit rule that will be checked against. - * Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure. - * - * @audit_rule_free: - * Deallocate the LSM audit rule structure previously allocated by - * audit_rule_init. - * @lsmrule contains the allocated rule. - * - * @inode_invalidate_secctx: - * Notify the security module that it must revalidate the security context - * of an inode. - * - * @inode_notifysecctx: - * Notify the security module of what the security context of an inode - * should be. Initializes the incore security context managed by the - * security module for this inode. Example usage: NFS client invokes - * this hook to initialize the security context in its incore inode to the - * value provided by the server for the file when the server returned the - * file's attributes to the client. - * Must be called with inode->i_mutex locked. - * @inode we wish to set the security context of. - * @ctx contains the string which we wish to set in the inode. - * @ctxlen contains the length of @ctx. - * Return 0 on success, error on failure. - * - * @inode_setsecctx: - * Change the security context of an inode. Updates the - * incore security context managed by the security module and invokes the - * fs code as needed (via __vfs_setxattr_noperm) to update any backing - * xattrs that represent the context. Example usage: NFS server invokes - * this hook to change the security context in its incore inode and on the - * backing filesystem to a value provided by the client on a SETATTR - * operation. - * Must be called with inode->i_mutex locked. - * @dentry contains the inode we wish to set the security context of. - * @ctx contains the string which we wish to set in the inode. - * @ctxlen contains the length of @ctx. - * Return 0 on success, error on failure. - * - * @inode_getsecctx: - * On success, returns 0 and fills out @ctx and @ctxlen with the security - * context for the given @inode. - * @inode we wish to get the security context of. - * @ctx is a pointer in which to place the allocated security context. - * @ctxlen points to the place to put the length of @ctx. - * Return 0 on success, error on failure. - * - * Security hooks for the general notification queue: - * - * @post_notification: - * Check to see if a watch notification can be posted to a particular - * queue. - * @w_cred: The credentials of the whoever set the watch. - * @cred: The event-triggerer's credentials. - * @n: The notification being posted. - * Return 0 if permission is granted. - * - * @watch_key: - * Check to see if a process is allowed to watch for event notifications - * from a key or keyring. - * @key: The key to watch. - * Return 0 if permission is granted. - * - * Security hooks for using the eBPF maps and programs functionalities through - * eBPF syscalls. - * - * @bpf: - * Do a initial check for all bpf syscalls after the attribute is copied - * into the kernel. The actual security module can implement their own - * rules to check the specific cmd they need. - * Return 0 if permission is granted. - * - * @bpf_map: - * Do a check when the kernel generate and return a file descriptor for - * eBPF maps. - * @map: bpf map that we want to access. - * @mask: the access flags. - * Return 0 if permission is granted. - * - * @bpf_prog: - * Do a check when the kernel generate and return a file descriptor for - * eBPF programs. - * @prog: bpf prog that userspace want to use. - * Return 0 if permission is granted. - * - * @bpf_map_alloc_security: - * Initialize the security field inside bpf map. - * Return 0 on success, error on failure. - * - * @bpf_map_free_security: - * Clean up the security information stored inside bpf map. - * - * @bpf_prog_alloc_security: - * Initialize the security field inside bpf program. - * Return 0 on success, error on failure. - * - * @bpf_prog_free_security: - * Clean up the security information stored inside bpf prog. - * - * @locked_down: - * Determine whether a kernel feature that potentially enables arbitrary - * code execution in kernel space should be permitted. - * @what: kernel feature being accessed. - * Return 0 if permission is granted. - * - * Security hooks for perf events - * - * @perf_event_open: - * Check whether the @type of perf_event_open syscall is allowed. - * Return 0 if permission is granted. - * @perf_event_alloc: - * Allocate and save perf_event security info. - * Return 0 on success, error on failure. - * @perf_event_free: - * Release (free) perf_event security info. - * @perf_event_read: - * Read perf_event security info if allowed. - * Return 0 if permission is granted. - * @perf_event_write: - * Write perf_event security info if allowed. - * Return 0 if permission is granted. - * - * Security hooks for io_uring - * - * @uring_override_creds: - * Check if the current task, executing an io_uring operation, is allowed - * to override it's credentials with @new. - * @new: the new creds to use. - * Return 0 if permission is granted. - * - * @uring_sqpoll: - * Check whether the current task is allowed to spawn a io_uring polling - * thread (IORING_SETUP_SQPOLL). - * Return 0 if permission is granted. - * - * @uring_cmd: - * Check whether the file_operations uring_cmd is allowed to run. - * Return 0 if permission is granted. - * - */ union security_list_options { #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); #include "lsm_hook_defs.h" @@ -1716,6 +92,7 @@ extern void security_add_hooks(struct security_hook_list *hooks, int count, enum lsm_order { LSM_ORDER_FIRST = -1, /* This is only for capabilities. */ LSM_ORDER_MUTABLE = 0, + LSM_ORDER_LAST = 1, /* This is only for integrity. */ }; struct lsm_info { @@ -1740,36 +117,6 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; __used __section(".early_lsm_info.init") \ __aligned(sizeof(unsigned long)) -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -/* - * Assuring the safety of deleting a security module is up to - * the security module involved. This may entail ordering the - * module's hook list in a particular way, refusing to disable - * the module once a policy is loaded or any number of other - * actions better imagined than described. - * - * The name of the configuration option reflects the only module - * that currently uses the mechanism. Any developer who thinks - * disabling their module is a good idea needs to be at least as - * careful as the SELinux team. - */ -static inline void security_delete_hooks(struct security_hook_list *hooks, - int count) -{ - int i; - - for (i = 0; i < count; i++) - hlist_del_rcu(&hooks[i].list); -} -#endif /* CONFIG_SECURITY_SELINUX_DISABLE */ - -/* Currently required to handle SELinux runtime hook disable. */ -#ifdef CONFIG_SECURITY_WRITABLE_HOOKS -#define __lsm_ro_after_init -#else -#define __lsm_ro_after_init __ro_after_init -#endif /* CONFIG_SECURITY_WRITABLE_HOOKS */ - extern int lsm_inode_alloc(struct inode *inode); #endif /* ! __LINUX_LSM_HOOKS_H */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 71b06ebad402..1db19a9d26e3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -36,6 +36,7 @@ #include <linux/types.h> #include <rdma/ib_verbs.h> #include <linux/mlx5/mlx5_ifc.h> +#include <linux/bitfield.h> #if defined(__LITTLE_ENDIAN) #define MLX5_SET_HOST_ENDIANNESS 0 @@ -980,14 +981,23 @@ enum { }; enum { - CQE_RSS_HTYPE_IP = 0x3 << 2, + CQE_RSS_HTYPE_IP = GENMASK(3, 2), /* cqe->rss_hash_type[3:2] - IP destination selected for hash * (00 = none, 01 = IPv4, 10 = IPv6, 11 = Reserved) */ - CQE_RSS_HTYPE_L4 = 0x3 << 6, + CQE_RSS_IP_NONE = 0x0, + CQE_RSS_IPV4 = 0x1, + CQE_RSS_IPV6 = 0x2, + CQE_RSS_RESERVED = 0x3, + + CQE_RSS_HTYPE_L4 = GENMASK(7, 6), /* cqe->rss_hash_type[7:6] - L4 destination selected for hash * (00 = none, 01 = TCP. 10 = UDP, 11 = IPSEC.SPI */ + CQE_RSS_L4_NONE = 0x0, + CQE_RSS_L4_TCP = 0x1, + CQE_RSS_L4_UDP = 0x2, + CQE_RSS_L4_IPSEC = 0x3, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f33389b42209..7e225e41d55b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1211,11 +1211,6 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } -static inline bool mlx5_core_is_management_pf(const struct mlx5_core_dev *dev) -{ - return MLX5_CAP_GEN(dev, num_ports) == 1 && !MLX5_CAP_GEN(dev, native_port_num); -} - static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 470085b121d3..c35f04f636f1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1624,7 +1624,8 @@ struct net_device_ops { struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); - int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash); + int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type); }; /** diff --git a/include/linux/notifier.h b/include/linux/notifier.h index aef88c2d1173..2aba75145144 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -73,6 +73,9 @@ struct raw_notifier_head { struct srcu_notifier_head { struct mutex mutex; +#ifdef CONFIG_TREE_SRCU + struct srcu_usage srcuu; +#endif struct srcu_struct srcu; struct notifier_block __rcu *head; }; @@ -107,7 +110,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ - .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0acb8e1fb7af..853184a46411 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1066,12 +1066,6 @@ static inline void folio_cancel_dirty(struct folio *folio) bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); void folio_invalidate(struct folio *folio, size_t offset, size_t length); -int __must_check folio_write_one(struct folio *folio); -static inline int __must_check write_one_page(struct page *page) -{ - return folio_write_one(page_folio(page)); -} - int __set_page_dirty_nobuffers(struct page *page); bool noop_dirty_folio(struct address_space *mapping, struct folio *folio); diff --git a/include/linux/pci.h b/include/linux/pci.h index b50e5c79f7e3..a5dda515fcd1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1624,6 +1624,8 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, flags, NULL); } +static inline bool pci_msix_can_alloc_dyn(struct pci_dev *dev) +{ return false; } static inline struct msi_map pci_msix_alloc_irq_at(struct pci_dev *dev, unsigned int index, const struct irq_affinity_desc *affdesc) { diff --git a/include/linux/pid.h b/include/linux/pid.h index 343abf22092e..b75de288a8c2 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -80,6 +80,7 @@ extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_create(struct pid *pid, unsigned int flags); +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); static inline struct pid *get_pid(struct pid *pid) { diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 21cc29b8a9e8..0e65b3d634d9 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -106,6 +106,8 @@ struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); +int posix_acl_listxattr(struct inode *inode, char **buffer, + ssize_t *remaining_size); #else static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) @@ -153,6 +155,11 @@ static inline int vfs_remove_acl(struct mnt_idmap *idmap, { return -EOPNOTSUPP; } +static inline int posix_acl_listxattr(struct inode *inode, char **buffer, + ssize_t *remaining_size) +{ + return 0; +} #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 54cd7a14330d..e86f3b731da2 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -68,7 +68,8 @@ static inline int posix_acl_type(const char *name) return -1; } -extern const struct xattr_handler posix_acl_access_xattr_handler; -extern const struct xattr_handler posix_acl_default_xattr_handler; +/* These are legacy handlers. Don't use them for new code. */ +extern const struct xattr_handler nop_posix_acl_access; +extern const struct xattr_handler nop_posix_acl_default; #endif /* _POSIX_ACL_XATTR_H */ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 75807ecef880..49539bc416ce 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -72,7 +72,6 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) -extern struct file *proc_ns_fget(int fd); #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) extern int ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 92ad75549e9c..b6e6378dcbbd 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -25,7 +25,8 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex, u32 portid, u32 seq); + int new_ifindex, u32 portid, + const struct nlmsghdr *nlh); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); diff --git a/include/linux/sched.h b/include/linux/sched.h index 63d242164b1a..e1e605b1255b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1729,7 +1729,7 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF__HOLE__00004000 0x00004000 +#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 357e0068497c..537cbf9a2ade 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -23,7 +23,13 @@ struct kernel_clone_args { int __user *pidfd; int __user *child_tid; int __user *parent_tid; + const char *name; int exit_signal; + u32 kthread:1; + u32 io_thread:1; + u32 user_worker:1; + u32 no_files:1; + u32 ignore_signals:1; unsigned long stack; unsigned long stack_size; unsigned long tls; @@ -31,8 +37,6 @@ struct kernel_clone_args { /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; - int io_thread; - int kthread; int idle; int (*fn)(void *); void *fn_arg; @@ -89,9 +93,12 @@ extern void exit_files(struct task_struct *); extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); +struct task_struct *copy_process(struct pid *pid, int trace, int node, + struct kernel_clone_args *args); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h new file mode 100644 index 000000000000..6123c10b99cf --- /dev/null +++ b/include/linux/sched/vhost_task.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VHOST_TASK_H +#define _LINUX_VHOST_TASK_H + +#include <linux/completion.h> + +struct task_struct; + +struct vhost_task { + int (*fn)(void *data); + void *data; + struct completion exited; + unsigned long flags; + struct task_struct *task; +}; + +struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, + const char *name); +void vhost_task_start(struct vhost_task *vtsk); +void vhost_task_stop(struct vhost_task *vtsk); +bool vhost_task_should_stop(struct vhost_task *vtsk); + +#endif diff --git a/include/linux/security.h b/include/linux/security.h index 5984d0d550b4..e2734e9e44d5 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -68,7 +68,7 @@ struct watch_notification; /* If capable is being called by a setid function */ #define CAP_OPT_INSETID BIT(2) -/* LSM Agnostic defines for fs_context::lsm_flags */ +/* LSM Agnostic defines for security_sb_set_mnt_opts() flags */ #define SECURITY_LSM_NATIVE_LABELS 1 struct ctl_table; @@ -336,9 +336,6 @@ int security_inode_init_security(struct inode *inode, struct inode *dir, int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode); -int security_old_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, const char **name, - void **value, size_t *len); int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode); int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry); @@ -778,15 +775,6 @@ static inline int security_inode_init_security_anon(struct inode *inode, return 0; } -static inline int security_old_inode_init_security(struct inode *inode, - struct inode *dir, - const struct qstr *qstr, - const char **name, - void **value, size_t *len) -{ - return -EOPNOTSUPP; -} - static inline int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff7ad331fb82..dbcaac8b6966 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -294,6 +294,7 @@ struct nf_bridge_info { u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; + u8 sabotage_in_done:1; __u16 frag_max_size; struct net_device *physindev; @@ -4712,7 +4713,7 @@ static inline void nf_reset_ct(struct sk_buff *skb) static inline void nf_reset_trace(struct sk_buff *skb) { -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } @@ -4732,7 +4733,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 74796cd7e7a9..41c4b26fb1c1 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,6 +102,32 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) return lock_is_held(&ssp->dep_map); } +/* + * Annotations provide deadlock detection for SRCU. + * + * Similar to other lockdep annotations, except there is an additional + * srcu_lock_sync(), which is basically an empty *write*-side critical section, + * see lock_sync() for more information. + */ + +/* Annotates a srcu_read_lock() */ +static inline void srcu_lock_acquire(struct lockdep_map *map) +{ + lock_map_acquire_read(map); +} + +/* Annotates a srcu_read_lock() */ +static inline void srcu_lock_release(struct lockdep_map *map) +{ + lock_map_release(map); +} + +/* Annotates a synchronize_srcu() */ +static inline void srcu_lock_sync(struct lockdep_map *map) +{ + lock_map_sync(map); +} + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) @@ -109,6 +135,10 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) return 1; } +#define srcu_lock_acquire(m) do { } while (0) +#define srcu_lock_release(m) do { } while (0) +#define srcu_lock_sync(m) do { } while (0) + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #define SRCU_NMI_UNKNOWN 0x0 @@ -182,7 +212,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); - rcu_lock_acquire(&(ssp)->dep_map); + srcu_lock_acquire(&(ssp)->dep_map); return retval; } @@ -254,7 +284,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, false); - rcu_lock_release(&(ssp)->dep_map); + srcu_lock_release(&(ssp)->dep_map); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 5aa5e0faf6a1..ebd72491af99 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -31,7 +31,7 @@ struct srcu_struct { void srcu_drive_gp(struct work_struct *wp); -#define __SRCU_STRUCT_INIT(name, __ignored) \ +#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ .srcu_cb_tail = &name.srcu_cb_head, \ @@ -44,9 +44,9 @@ void srcu_drive_gp(struct work_struct *wp); * Tree SRCU, which needs some per-CPU data. */ #define DEFINE_SRCU(name) \ - struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) #define DEFINE_STATIC_SRCU(name) \ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) void synchronize_srcu(struct srcu_struct *ssp); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 558057b517b7..8f3f72480e78 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -58,9 +58,9 @@ struct srcu_node { }; /* - * Per-SRCU-domain structure, similar in function to rcu_state. + * Per-SRCU-domain structure, update-side data linked from srcu_struct. */ -struct srcu_struct { +struct srcu_usage { struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ @@ -68,7 +68,6 @@ struct srcu_struct { struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ - unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ @@ -77,7 +76,6 @@ struct srcu_struct { unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ - struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ @@ -89,32 +87,68 @@ struct srcu_struct { unsigned long reschedule_jiffies; unsigned long reschedule_count; struct delayed_work work; + struct srcu_struct *srcu_ssp; +}; + +/* + * Per-SRCU-domain structure, similar in function to rcu_state. + */ +struct srcu_struct { + unsigned int srcu_idx; /* Current rdr array element. */ + struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; + struct srcu_usage *srcu_sup; /* Update-side data. */ }; -/* Values for size state variable (->srcu_size_state). */ -#define SRCU_SIZE_SMALL 0 -#define SRCU_SIZE_ALLOC 1 -#define SRCU_SIZE_WAIT_BARRIER 2 -#define SRCU_SIZE_WAIT_CALL 3 -#define SRCU_SIZE_WAIT_CBS1 4 -#define SRCU_SIZE_WAIT_CBS2 5 -#define SRCU_SIZE_WAIT_CBS3 6 -#define SRCU_SIZE_WAIT_CBS4 7 -#define SRCU_SIZE_BIG 8 +// Values for size state variable (->srcu_size_state). Once the state +// has been set to SRCU_SIZE_ALLOC, the grace-period code advances through +// this state machine one step per grace period until the SRCU_SIZE_BIG state +// is reached. Otherwise, the state machine remains in the SRCU_SIZE_SMALL +// state indefinitely. +#define SRCU_SIZE_SMALL 0 // No srcu_node combining tree, ->node == NULL +#define SRCU_SIZE_ALLOC 1 // An srcu_node tree is being allocated, initialized, + // and then referenced by ->node. It will not be used. +#define SRCU_SIZE_WAIT_BARRIER 2 // The srcu_node tree starts being used by everything + // except call_srcu(), especially by srcu_barrier(). + // By the end of this state, all CPUs and threads + // are aware of this tree's existence. +#define SRCU_SIZE_WAIT_CALL 3 // The srcu_node tree starts being used by call_srcu(). + // By the end of this state, all of the call_srcu() + // invocations that were running on a non-boot CPU + // and using the boot CPU's callback queue will have + // completed. +#define SRCU_SIZE_WAIT_CBS1 4 // Don't trust the ->srcu_have_cbs[] grace-period +#define SRCU_SIZE_WAIT_CBS2 5 // sequence elements or the ->srcu_data_have_cbs[] +#define SRCU_SIZE_WAIT_CBS3 6 // CPU-bitmask elements until all four elements of +#define SRCU_SIZE_WAIT_CBS4 7 // each array have been initialized. +#define SRCU_SIZE_BIG 8 // The srcu_node combining tree is fully initialized + // and all aspects of it are being put to use. /* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define __SRCU_STRUCT_INIT(name, pcpu_name) \ -{ \ - .sda = &pcpu_name, \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .srcu_gp_seq_needed = -1UL, \ - .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ - __SRCU_DEP_MAP_INIT(name) \ +#define __SRCU_USAGE_INIT(name) \ +{ \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .srcu_gp_seq_needed = -1UL, \ + .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ +} + +#define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ + .srcu_sup = &usage_name, \ + __SRCU_DEP_MAP_INIT(name) + +#define __SRCU_STRUCT_INIT_MODULE(name, usage_name) \ +{ \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ +} + +#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ +{ \ + .sda = &pcpu_name, \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } /* @@ -137,16 +171,18 @@ struct srcu_struct { * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #ifdef MODULE -# define __DEFINE_SRCU(name, is_static) \ - is_static struct srcu_struct name; \ - extern struct srcu_struct * const __srcu_struct_##name; \ - struct srcu_struct * const __srcu_struct_##name \ +# define __DEFINE_SRCU(name, is_static) \ + static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ + extern struct srcu_struct * const __srcu_struct_##name; \ + struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else -# define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ - is_static struct srcu_struct name = \ - __SRCU_STRUCT_INIT(name, name##_srcu_data) +# define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ + static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ + is_static struct srcu_struct name = \ + __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) #endif #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/include/linux/tick.h b/include/linux/tick.h index bfd571f18cfd..9459fef5b857 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -216,6 +216,7 @@ extern void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit); extern void tick_nohz_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit); +extern bool tick_nohz_cpu_hotpluggable(unsigned int cpu); /* * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases @@ -280,6 +281,7 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } static inline void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } static inline void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } +static inline bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { return true; } static inline void tick_dep_set(enum tick_dep_bits bit) { } static inline void tick_dep_clear(enum tick_dep_bits bit) { } diff --git a/include/linux/uio.h b/include/linux/uio.h index 27e3fd942960..ed35f4427a0a 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -49,14 +49,35 @@ struct iov_iter { size_t iov_offset; int last_offset; }; - size_t count; + /* + * Hack alert: overlay ubuf_iovec with iovec + count, so + * that the members resolve correctly regardless of the type + * of iterator used. This means that you can use: + * + * &iter->__ubuf_iovec or iter->__iov + * + * interchangably for the user_backed cases, hence simplifying + * some of the cases that need to deal with both. + */ union { - const struct iovec *iov; - const struct kvec *kvec; - const struct bio_vec *bvec; - struct xarray *xarray; - struct pipe_inode_info *pipe; - void __user *ubuf; + /* + * This really should be a const, but we cannot do that without + * also modifying any of the zero-filling iter init functions. + * Leave it non-const for now, but it should be treated as such. + */ + struct iovec __ubuf_iovec; + struct { + union { + /* use iter_iov() to get the current vec */ + const struct iovec *__iov; + const struct kvec *kvec; + const struct bio_vec *bvec; + struct xarray *xarray; + struct pipe_inode_info *pipe; + void __user *ubuf; + }; + size_t count; + }; }; union { unsigned long nr_segs; @@ -68,6 +89,16 @@ struct iov_iter { }; }; +static inline const struct iovec *iter_iov(const struct iov_iter *iter) +{ + if (iter->iter_type == ITER_UBUF) + return (const struct iovec *) &iter->__ubuf_iovec; + return iter->__iov; +} + +#define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) +#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) + static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; @@ -143,15 +174,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) return ret; } -static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) -{ - return (struct iovec) { - .iov_base = iter->iov->iov_base + iter->iov_offset, - .iov_len = min(iter->count, - iter->iov->iov_len - iter->iov_offset), - }; -} - size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); @@ -359,7 +381,8 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, .user_backed = true, .data_source = direction, .ubuf = buf, - .count = count + .count = count, + .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 6af72461397d..d591ef59aa98 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -47,6 +47,22 @@ struct xattr_handler { size_t size, int flags); }; +/** + * xattr_handler_can_list - check whether xattr can be listed + * @handler: handler for this type of xattr + * @dentry: dentry whose inode xattr to list + * + * Determine whether the xattr associated with @dentry can be listed given + * @handler. + * + * Return: true if xattr can be listed, false if not. + */ +static inline bool xattr_handler_can_list(const struct xattr_handler *handler, + struct dentry *dentry) +{ + return handler && (!handler->list || handler->list(dentry)); +} + const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { @@ -78,7 +94,7 @@ int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); -int xattr_supported_namespace(struct inode *inode, const char *prefix); +int xattr_supports_user_prefix(struct inode *inode); static inline const char *xattr_prefix(const struct xattr_handler *handler) { @@ -109,5 +125,6 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); +int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); #endif /* _LINUX_XATTR_H */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6ed9b4d546a7..d5311ceb21c6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -954,6 +954,7 @@ enum { HCI_CONN_STK_ENCRYPT, HCI_CONN_AUTH_INITIATOR, HCI_CONN_DROP, + HCI_CONN_CANCEL, HCI_CONN_PARAM_REMOVAL_PEND, HCI_CONN_NEW_LINK_KEY, HCI_CONN_SCANNING, diff --git a/include/net/bonding.h b/include/net/bonding.h index ea36ab7f9e72..c3843239517d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -761,13 +761,17 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) #if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { + struct in6_addr mcaddr; int i; - for (i = 0; i < BOND_MAX_NS_TARGETS; i++) - if (ipv6_addr_equal(&targets[i], ip)) + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { + addrconf_addr_solict_mult(&targets[i], &mcaddr); + if ((ipv6_addr_equal(&targets[i], ip)) || + (ipv6_addr_equal(&mcaddr, ip))) return i; else if (ipv6_addr_any(&targets[i])) break; + } return -1; } diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9430128aae99..1b8e305bb54a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1085,6 +1085,10 @@ struct nft_chain { }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem); +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, diff --git a/include/net/xdp.h b/include/net/xdp.h index 41c57b8b1671..76aa748e7923 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -8,6 +8,7 @@ #include <linux/skbuff.h> /* skb_shared_info */ #include <uapi/linux/netdev.h> +#include <linux/bitfield.h> /** * DOC: XDP RX-queue information @@ -425,6 +426,52 @@ XDP_METADATA_KFUNC_xxx MAX_XDP_METADATA_KFUNC, }; +enum xdp_rss_hash_type { + /* First part: Individual bits for L3/L4 types */ + XDP_RSS_L3_IPV4 = BIT(0), + XDP_RSS_L3_IPV6 = BIT(1), + + /* The fixed (L3) IPv4 and IPv6 headers can both be followed by + * variable/dynamic headers, IPv4 called Options and IPv6 called + * Extension Headers. HW RSS type can contain this info. + */ + XDP_RSS_L3_DYNHDR = BIT(2), + + /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in + * addition to the protocol specific bit. This ease interaction with + * SKBs and avoids reserving a fixed mask for future L4 protocol bits. + */ + XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ + XDP_RSS_L4_TCP = BIT(4), + XDP_RSS_L4_UDP = BIT(5), + XDP_RSS_L4_SCTP = BIT(6), + XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + + /* Second part: RSS hash type combinations used for driver HW mapping */ + XDP_RSS_TYPE_NONE = 0, + XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, + + XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, + XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, + XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, + + XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, + XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, + XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, + XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + + XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, + XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, + XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + + XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, +}; + #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index cf4a0d28b178..71dbe8bfa7db 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -71,8 +71,8 @@ TRACE_EVENT(erofs_fill_inode, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; - __entry->blkaddr = erofs_blknr(erofs_iloc(inode)); - __entry->ofs = erofs_blkoff(erofs_iloc(inode)); + __entry->blkaddr = erofs_blknr(inode->i_sb, erofs_iloc(inode)); + __entry->ofs = erofs_blkoff(inode->i_sb, erofs_iloc(inode)); ), TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u", diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 012fa0d171b2..2ef9c719772a 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -776,9 +776,7 @@ TRACE_EVENT_RCU(rcu_torture_read, ), TP_fast_assign( - strncpy(__entry->rcutorturename, rcutorturename, - RCUTORTURENAME_LEN); - __entry->rcutorturename[RCUTORTURENAME_LEN - 1] = 0; + strscpy(__entry->rcutorturename, rcutorturename, RCUTORTURENAME_LEN); __entry->rhp = rhp; __entry->secs = secs; __entry->c_old = c_old; diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 2e713a7d9aa3..3e8619c72f77 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -371,7 +371,8 @@ TRACE_EVENT(itimer_expire, tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name(CLOCK_UNSTABLE) \ - tick_dep_name_end(RCU) + tick_dep_name(RCU) \ + tick_dep_name_end(RCU_EXP) #undef tick_dep_name #undef tick_dep_mask_name diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 1ecdb911add8..80f37a0d40d7 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -91,7 +91,6 @@ /* a horrid kludge trying to make sure that this will fail on old kernels */ #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) #ifndef O_NDELAY #define O_NDELAY O_NONBLOCK diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index f3223f964691..81d09ef9aa50 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -130,21 +130,37 @@ struct landlock_path_beneath_attr { * - %LANDLOCK_ACCESS_FS_MAKE_BLOCK: Create (or rename or link) a block device. * - %LANDLOCK_ACCESS_FS_MAKE_SYM: Create (or rename or link) a symbolic link. * - %LANDLOCK_ACCESS_FS_REFER: Link or rename a file from or to a different - * directory (i.e. reparent a file hierarchy). This access right is - * available since the second version of the Landlock ABI. This is also the - * only access right which is always considered handled by any ruleset in - * such a way that reparenting a file hierarchy is always denied by default. - * To avoid privilege escalation, it is not enough to add a rule with this - * access right. When linking or renaming a file, the destination directory - * hierarchy must also always have the same or a superset of restrictions of - * the source hierarchy. If it is not the case, or if the domain doesn't - * handle this access right, such actions are denied by default with errno - * set to ``EXDEV``. Linking also requires a ``LANDLOCK_ACCESS_FS_MAKE_*`` - * access right on the destination directory, and renaming also requires a - * ``LANDLOCK_ACCESS_FS_REMOVE_*`` access right on the source's (file or - * directory) parent. Otherwise, such actions are denied with errno set to - * ``EACCES``. The ``EACCES`` errno prevails over ``EXDEV`` to let user space - * efficiently deal with an unrecoverable error. + * directory (i.e. reparent a file hierarchy). + * + * This access right is available since the second version of the Landlock + * ABI. + * + * This is the only access right which is denied by default by any ruleset, + * even if the right is not specified as handled at ruleset creation time. + * The only way to make a ruleset grant this right is to explicitly allow it + * for a specific directory by adding a matching rule to the ruleset. + * + * In particular, when using the first Landlock ABI version, Landlock will + * always deny attempts to reparent files between different directories. + * + * In addition to the source and destination directories having the + * %LANDLOCK_ACCESS_FS_REFER access right, the attempted link or rename + * operation must meet the following constraints: + * + * * The reparented file may not gain more access rights in the destination + * directory than it previously had in the source directory. If this is + * attempted, the operation results in an ``EXDEV`` error. + * + * * When linking or renaming, the ``LANDLOCK_ACCESS_FS_MAKE_*`` right for the + * respective file type must be granted for the destination directory. + * Otherwise, the operation results in an ``EACCES`` error. + * + * * When renaming, the ``LANDLOCK_ACCESS_FS_REMOVE_*`` right for the + * respective file type must be granted for the source directory. Otherwise, + * the operation results in an ``EACCES`` error. + * + * If multiple requirements are not met, the ``EACCES`` error code takes + * precedence over ``EXDEV``. * * .. warning:: * diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 91b4c63d5cbf..1c9da485318f 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -36,6 +36,13 @@ enum { * SEV Firmware status code */ typedef enum { + /* + * This error code is not in the SEV spec. Its purpose is to convey that + * there was an error that prevented the SEV firmware from being called. + * The SEV API error codes are 16 bits, so the -1 value will not overlap + * with possible values from the specification. + */ + SEV_RET_NO_FW_CALL = -1, SEV_RET_SUCCESS = 0, SEV_RET_INVALID_PLATFORM_STATE, SEV_RET_INVALID_GUEST_STATE, diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h index 256aaeff7e65..2aa39112cf8d 100644 --- a/include/uapi/linux/sev-guest.h +++ b/include/uapi/linux/sev-guest.h @@ -52,8 +52,14 @@ struct snp_guest_request_ioctl { __u64 req_data; __u64 resp_data; - /* firmware error code on failure (see psp-sev.h) */ - __u64 fw_err; + /* bits[63:32]: VMM error code, bits[31:0] firmware error code (see psp-sev.h) */ + union { + __u64 exitinfo2; + struct { + __u32 fw_error; + __u32 vmm_error; + }; + }; }; struct snp_ext_report_req { @@ -77,4 +83,12 @@ struct snp_ext_report_req { /* Get SNP extended report as defined in the GHCB specification version 2. */ #define SNP_GET_EXT_REPORT _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x2, struct snp_guest_request_ioctl) +/* Guest message request EXIT_INFO_2 constants */ +#define SNP_GUEST_FW_ERR_MASK GENMASK_ULL(31, 0) +#define SNP_GUEST_VMM_ERR_SHIFT 32 +#define SNP_GUEST_VMM_ERR(x) (((u64)x) << SNP_GUEST_VMM_ERR_SHIFT) + +#define SNP_GUEST_VMM_ERR_INVALID_LEN 1 +#define SNP_GUEST_VMM_ERR_BUSY 2 + #endif /* __UAPI_LINUX_SEV_GUEST_H_ */ diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 5af2a0300bb9..3744e4da1b2a 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -140,11 +140,11 @@ struct virtio_blk_config { /* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */ struct virtio_blk_zoned_characteristics { - __le32 zone_sectors; - __le32 max_open_zones; - __le32 max_active_zones; - __le32 max_append_sectors; - __le32 write_granularity; + __virtio32 zone_sectors; + __virtio32 max_open_zones; + __virtio32 max_active_zones; + __virtio32 max_append_sectors; + __virtio32 write_granularity; __u8 model; __u8 unused2[3]; } zoned; @@ -241,11 +241,11 @@ struct virtio_blk_outhdr { */ struct virtio_blk_zone_descriptor { /* Zone capacity */ - __le64 z_cap; + __virtio64 z_cap; /* The starting sector of the zone */ - __le64 z_start; + __virtio64 z_start; /* Zone write pointer position in sectors */ - __le64 z_wp; + __virtio64 z_wp; /* Zone type */ __u8 z_type; /* Zone state */ @@ -254,7 +254,7 @@ struct virtio_blk_zone_descriptor { }; struct virtio_blk_zone_report { - __le64 nr_zones; + __virtio64 nr_zones; __u8 reserved[56]; struct virtio_blk_zone_descriptor zones[]; }; diff --git a/init/Kconfig b/init/Kconfig index 1fb5f313d18f..c88bb30a8b0b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -890,18 +890,14 @@ config CC_IMPLICIT_FALLTHROUGH default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5) default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough) -# Currently, disable gcc-11,12 array-bounds globally. -# We may want to target only particular configurations some day. +# Currently, disable gcc-11+ array-bounds globally. +# It's still broken in gcc-13, so no upper bound yet. config GCC11_NO_ARRAY_BOUNDS def_bool y -config GCC12_NO_ARRAY_BOUNDS - def_bool y - config CC_NO_ARRAY_BOUNDS bool - default y if CC_IS_GCC && GCC_VERSION >= 110000 && GCC_VERSION < 120000 && GCC11_NO_ARRAY_BOUNDS - default y if CC_IS_GCC && GCC_VERSION >= 120000 && GCC_VERSION < 130000 && GCC12_NO_ARRAY_BOUNDS + default y if CC_IS_GCC && GCC_VERSION >= 110000 && GCC11_NO_ARRAY_BOUNDS # # For architectures that know their GCC __int128 support is sound diff --git a/init/initramfs.c b/init/initramfs.c index f6c112e30bd4..e7a01c2ccd1b 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -60,15 +60,8 @@ static void __init error(char *x) message = x; } -static void panic_show_mem(const char *fmt, ...) -{ - va_list args; - - show_mem(0, NULL); - va_start(args, fmt); - panic(fmt, args); - va_end(args); -} +#define panic_show_mem(fmt, ...) \ + ({ show_mem(0, NULL); panic(fmt, ##__VA_ARGS__); }) /* link hash */ diff --git a/init/main.c b/init/main.c index bb87b789c543..c62f0c8811d7 100644 --- a/init/main.c +++ b/init/main.c @@ -711,7 +711,7 @@ noinline void __ref rest_init(void) rcu_read_unlock(); numa_default_policy(); - pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); + pid = kernel_thread(kthreadd, NULL, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); @@ -1092,14 +1092,6 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) */ locking_selftest(); - /* - * This needs to be called before any devices perform DMA - * operations that might use the SWIOTLB bounce buffers. It will - * mark the bounce buffers as decrypted so that their usage will - * not cause "plain-text" data to be decrypted when accessed. - */ - mem_encrypt_init(); - #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { @@ -1116,6 +1108,17 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) late_time_init(); sched_clock_init(); calibrate_delay(); + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. It + * must be called after late_time_init() so that Hyper-V x86/x64 + * hypercalls work when the SWIOTLB bounce buffers are decrypted. + */ + mem_encrypt_init(); + pid_idr_init(); anon_vma_init(); #ifdef CONFIG_X86 diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2a8b8c304d2a..4a865f0e85d0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -998,7 +998,7 @@ static void __io_req_complete_post(struct io_kiocb *req) void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) { + if (req->ctx->task_complete && req->ctx->submitter_task != current) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || diff --git a/io_uring/net.c b/io_uring/net.c index 4040cf093318..89e839013837 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -184,8 +184,8 @@ static int io_setup_async_msg(struct io_kiocb *req, async_msg->msg.msg_name = &async_msg->addr; /* if were using fast_iov, set it to the new one */ if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { - size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; - async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; + size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; + async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; } return -EAGAIN; diff --git a/io_uring/rw.c b/io_uring/rw.c index 4c233910e200..f33ba6f28247 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -447,26 +447,25 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) ppos = io_kiocb_ppos(kiocb); while (iov_iter_count(iter)) { - struct iovec iovec; + void __user *addr; + size_t len; ssize_t nr; if (iter_is_ubuf(iter)) { - iovec.iov_base = iter->ubuf + iter->iov_offset; - iovec.iov_len = iov_iter_count(iter); + addr = iter->ubuf + iter->iov_offset; + len = iov_iter_count(iter); } else if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); + addr = iter_iov_addr(iter); + len = iter_iov_len(iter); } else { - iovec.iov_base = u64_to_user_ptr(rw->addr); - iovec.iov_len = rw->len; + addr = u64_to_user_ptr(rw->addr); + len = rw->len; } - if (ddir == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } + if (ddir == READ) + nr = file->f_op->read(file, addr, len, ppos); + else + nr = file->f_op->write(file, addr, len, ppos); if (nr < 0) { if (!ret) @@ -482,7 +481,7 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if (!rw->len) break; } - if (nr != iovec.iov_len) + if (nr != len) break; } @@ -503,10 +502,10 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, if (!iovec) { unsigned iov_off = 0; - io->s.iter.iov = io->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - io->s.iter.iov += iov_off; + io->s.iter.__iov = io->s.fast_iov; + if (iter->__iov != fast_iov) { + iov_off = iter_iov(iter) - fast_iov; + io->s.iter.__iov += iov_off; } if (io->s.fast_iov != fast_iov) memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, diff --git a/kernel/Makefile b/kernel/Makefile index 10ef068f598d..6fc72b3afbde 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o +obj-$(CONFIG_VHOST_TASK) += vhost_task.o ifdef CONFIG_FUNCTION_TRACER # Do not trace internal ftrace files diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 05f4c66c9089..85720311cc67 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -84,16 +84,13 @@ void bpf_inode_storage_free(struct inode *inode) static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; - struct file *f; - int fd; + struct fd f = fdget_raw(*(int *)key); - fd = *(int *)key; - f = fget_raw(fd); - if (!f) + if (!f.file) return ERR_PTR(-EBADF); - sdata = inode_storage_lookup(f->f_inode, map, true); - fput(f); + sdata = inode_storage_lookup(file_inode(f.file), map, true); + fdput(f); return sdata ? sdata->data : NULL; } @@ -101,22 +98,19 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; - struct file *f; - int fd; + struct fd f = fdget_raw(*(int *)key); - fd = *(int *)key; - f = fget_raw(fd); - if (!f) + if (!f.file) return -EBADF; - if (!inode_storage_ptr(f->f_inode)) { - fput(f); + if (!inode_storage_ptr(file_inode(f.file))) { + fdput(f); return -EBADF; } - sdata = bpf_local_storage_update(f->f_inode, + sdata = bpf_local_storage_update(file_inode(f.file), (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); - fput(f); + fdput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -135,16 +129,14 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) { - struct file *f; - int fd, err; + struct fd f = fdget_raw(*(int *)key); + int err; - fd = *(int *)key; - f = fget_raw(fd); - if (!f) + if (!f.file) return -EBADF; - err = inode_storage_delete(f->f_inode, map); - fput(f); + err = inode_storage_delete(file_inode(f.file), map); + fdput(f); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d517d13878cf..767e8930b0bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2967,6 +2967,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, } } else if (opcode == BPF_EXIT) { return -ENOTSUPP; + } else if (BPF_SRC(insn->code) == BPF_X) { + if (!(*reg_mask & (dreg | sreg))) + return 0; + /* dreg <cond> sreg + * Both dreg and sreg need precision before + * this insn. If only sreg was marked precise + * before it would be equally necessary to + * propagate it to dreg. + */ + *reg_mask |= (sreg | dreg); + /* else dreg <cond> K + * Only dreg still needs precision before + * this insn, so for the K-based conditional + * there is nothing new to be marked. + */ } } else if (class == BPF_LD) { if (!(*reg_mask & dreg)) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 935e8121b21e..4b249f81c693 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6856,14 +6856,12 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; - struct file *f; - - f = fget_raw(fd); - if (!f) + struct fd f = fdget_raw(fd); + if (!f.file) return ERR_PTR(-EBADF); - cgrp = cgroup_v1v2_get_from_file(f); - fput(f); + cgrp = cgroup_v1v2_get_from_file(f.file); + fdput(f); return cgrp; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 636f1c682ac0..505d86b16642 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1513,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); if (adding || deleting) - update_tasks_cpumask(parent, tmp->new_cpus); + update_tasks_cpumask(parent, tmp->addmask); /* * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. @@ -1770,10 +1770,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* * Use the cpumasks in trialcs for tmpmasks when they are pointers * to allocated cpumasks. + * + * Note that update_parent_subparts_cpumask() uses only addmask & + * delmask, but not new_cpus. */ tmp.addmask = trialcs->subparts_cpus; tmp.delmask = trialcs->effective_cpus; - tmp.new_cpus = trialcs->cpus_allowed; + tmp.new_cpus = NULL; #endif retval = validate_change(cs, trialcs); @@ -1838,6 +1841,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } spin_unlock_irq(&callback_lock); +#ifdef CONFIG_CPUMASK_OFFSTACK + /* Now trialcs->cpus_allowed is available */ + tmp.new_cpus = trialcs->cpus_allowed; +#endif + /* effective_cpus will be updated here */ update_cpumasks_hier(cs, &tmp, false); @@ -2445,6 +2453,20 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; +/* + * Check to see if a cpuset can accept a new task + * For v1, cpus_allowed and mems_allowed can't be empty. + * For v2, effective_cpus can't be empty. + * Note that in v1, effective_cpus = cpus_allowed. + */ +static int cpuset_can_attach_check(struct cpuset *cs) +{ + if (cpumask_empty(cs->effective_cpus) || + (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) + return -ENOSPC; + return 0; +} + /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { @@ -2459,16 +2481,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) percpu_down_write(&cpuset_rwsem); - /* allow moving tasks into an empty cpuset if on default hierarchy */ - ret = -ENOSPC; - if (!is_in_v2_mode() && - (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) - goto out_unlock; - - /* - * Task cannot be moved to a cpuset with empty effective cpus. - */ - if (cpumask_empty(cs->effective_cpus)) + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); + if (ret) goto out_unlock; cgroup_taskset_for_each(task, css, tset) { @@ -2485,7 +2500,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; - ret = 0; out_unlock: percpu_up_write(&cpuset_rwsem); return ret; @@ -2494,25 +2508,47 @@ out_unlock: static void cpuset_cancel_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; + struct cpuset *cs; cgroup_taskset_first(tset, &css); + cs = css_cs(css); percpu_down_write(&cpuset_rwsem); - css_cs(css)->attach_in_progress--; + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); } /* - * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_to; + +static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) +{ + percpu_rwsem_assert_held(&cpuset_rwsem); + + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); + else + cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), + cs->subparts_cpus); + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flags(cs, task); +} static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_rwsem */ - static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; struct cgroup_subsys_state *css; @@ -2543,20 +2579,8 @@ static void cpuset_attach(struct cgroup_taskset *tset) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { - if (cs != &top_cpuset) - guarantee_online_cpus(task, cpus_attach); - else - cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); - /* - * can_attach beforehand should guarantee that this doesn't - * fail. TODO: have a better way to handle failure here - */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); - - cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flags(cs, task); - } + cgroup_taskset_for_each(task, css, tset) + cpuset_attach_task(cs, task); /* * Change mm for all threadgroup leaders. This is expensive and may @@ -3248,17 +3272,101 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } /* + * In case the child is cloned into a cpuset different from its parent, + * additional checks are done to see if the move is allowed. + */ +static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + int ret; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return 0; + + lockdep_assert_held(&cgroup_mutex); + percpu_down_write(&cpuset_rwsem); + + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); + if (ret) + goto out_unlock; + + ret = task_can_attach(task, cs->effective_cpus); + if (ret) + goto out_unlock; + + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; +out_unlock: + percpu_up_write(&cpuset_rwsem); + return ret; +} + +static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return; + + percpu_down_write(&cpuset_rwsem); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + percpu_up_write(&cpuset_rwsem); +} + +/* * Make sure the new task conform to the current state of its parent, * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ static void cpuset_fork(struct task_struct *task) { - if (task_css_is_root(task, cpuset_cgrp_id)) + struct cpuset *cs; + bool same_cs; + + rcu_read_lock(); + cs = task_cs(task); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) { + if (cs == &top_cpuset) + return; + + set_cpus_allowed_ptr(task, current->cpus_ptr); + task->mems_allowed = current->mems_allowed; return; + } + + /* CLONE_INTO_CGROUP */ + percpu_down_write(&cpuset_rwsem); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cpuset_attach_task(cs, task); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); - set_cpus_allowed_ptr(task, current->cpus_ptr); - task->mems_allowed = current->mems_allowed; + percpu_up_write(&cpuset_rwsem); } struct cgroup_subsys cpuset_cgrp_subsys = { @@ -3271,6 +3379,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .can_fork = cpuset_can_fork, + .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, .legacy_cftypes = legacy_files, .dfl_cftypes = dfl_files, diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 1b6b21851e9d..936473203a6b 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -22,6 +22,7 @@ #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/mutex.h> +#include <linux/cpu.h> /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is @@ -350,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) - static_branch_inc(&freezer_active); + static_branch_inc_cpuslocked(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { @@ -361,7 +362,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (!(freezer->state & CGROUP_FREEZING)) { freezer->state &= ~CGROUP_FROZEN; if (was_freezing) - static_branch_dec(&freezer_active); + static_branch_dec_cpuslocked(&freezer_active); unfreeze_cgroup(freezer); } } @@ -379,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; + cpus_read_lock(); /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as @@ -407,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) } rcu_read_unlock(); mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static ssize_t freezer_write(struct kernfs_open_file *of, diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 831f1f472bb8..0a2b4967e333 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -457,9 +457,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) struct task_cputime *cputime = &bstat->cputime; int i; - cputime->stime = 0; - cputime->utime = 0; - cputime->sum_exec_runtime = 0; + memset(bstat, 0, sizeof(*bstat)); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; diff --git a/kernel/fork.c b/kernel/fork.c index 0c92f224c68c..bfe73db1c26c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + destroy_context(mm); fail_nocontext: mm_free_pgd(mm); fail_nopgd: @@ -1625,7 +1626,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk, + int no_files) { struct files_struct *oldf, *newf; int error = 0; @@ -1637,6 +1639,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) if (!oldf) goto out; + if (no_files) { + tsk->files = NULL; + goto out; + } + if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); goto out; @@ -1954,6 +1961,91 @@ const struct file_operations pidfd_fops = { #endif }; +/** + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + + * The helper doesn't perform checks on @pid which makes it useful for pidfds + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and + * pidfd file are prepared. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + int pidfd; + struct file *pidfd_file; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + + pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfd_file)) { + put_unused_fd(pidfd); + return PTR_ERR(pidfd_file); + } + get_pid(pid); /* held by pidfd_file now */ + *ret = pidfd_file; + return pidfd; +} + +/** + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper verifies that @pid is used as a thread group leader. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + return __pidfd_prepare(pid, flags, ret); +} + static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -2008,7 +2100,7 @@ static void rv_task_fork(struct task_struct *p) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static __latent_entropy struct task_struct *copy_process( +__latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, @@ -2101,6 +2193,8 @@ static __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; + if (args->user_worker) + p->flags |= PF_USER_WORKER; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@ -2110,6 +2204,9 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->name) + strscpy_pad(p->comm, args->name, sizeof(p->comm)); + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? @@ -2252,7 +2349,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; - retval = copy_files(clone_flags, p); + retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); @@ -2277,6 +2374,9 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; + if (args->ignore_signals) + ignore_signals(p); + stackleak_task_init(p); if (pid != &init_struct_pid) { @@ -2294,21 +2394,12 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + /* Note that no task has been attached to @pid yet. */ + retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; - pidfd = retval; - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - O_RDWR | O_CLOEXEC); - if (IS_ERR(pidfile)) { - put_unused_fd(pidfd); - retval = PTR_ERR(pidfile); - goto bad_fork_free_pid; - } - get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; @@ -2625,6 +2716,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .fn = fn, .fn_arg = arg, .io_thread = 1, + .user_worker = 1, }; return copy_process(NULL, 0, node, &args); @@ -2728,7 +2820,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) /* * Create a kernel thread. */ -pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | @@ -2736,6 +2829,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, + .name = name, .kthread = 1, }; diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 54d077e1a2dc..5a60cc52adc0 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -337,11 +337,20 @@ static void delay_access(int type) */ static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size) { + /* + * In the below we don't necessarily need the read of the location to + * be atomic, and we don't use READ_ONCE(), since all we need for race + * detection is to observe 2 different values. + * + * Furthermore, on certain architectures (such as arm64), READ_ONCE() + * may turn into more complex instructions than a plain load that cannot + * do unaligned accesses. + */ switch (size) { - case 1: return READ_ONCE(*(const u8 *)ptr); - case 2: return READ_ONCE(*(const u16 *)ptr); - case 4: return READ_ONCE(*(const u32 *)ptr); - case 8: return READ_ONCE(*(const u64 *)ptr); + case 1: return *(const volatile u8 *)ptr; + case 2: return *(const volatile u16 *)ptr; + case 4: return *(const volatile u32 *)ptr; + case 8: return *(const volatile u64 *)ptr; default: return 0; /* Ignore; we do not diff the values. */ } } diff --git a/kernel/kthread.c b/kernel/kthread.c index 7e6751b29101..4bc6e0971ec9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -38,6 +38,7 @@ struct task_struct *kthreadd_task; struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ + char *full_name; int (*threadfn)(void *data); void *data; int node; @@ -343,10 +344,12 @@ static int kthread(void *_create) /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { + kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } + self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; @@ -396,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create) current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ - pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); + pid = kernel_thread(kthread, create, create->full_name, + CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); + kfree(create->full_name); if (!done) { kfree(create); return; @@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), create->data = data; create->node = node; create->done = &done; + create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); + if (!create->full_name) { + task = ERR_PTR(-ENOMEM); + goto free_create; + } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); @@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), wait_for_completion(&done); } task = create->result; - if (!IS_ERR(task)) { - char name[TASK_COMM_LEN]; - va_list aq; - int len; - - /* - * task is already visible to other tasks, so updating - * COMM must be protected. - */ - va_copy(aq, args); - len = vsnprintf(name, sizeof(name), namefmt, aq); - va_end(aq); - if (len >= TASK_COMM_LEN) { - struct kthread *kthread = to_kthread(task); - - /* leave it truncated when out of memory. */ - kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args); - } - set_task_comm(task, name); - } +free_create: kfree(create); return task; } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 50d4863974e7..dcd1d5bfc1e0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1881,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src, struct lock_class *source = hlock_class(src); struct lock_class *target = hlock_class(tgt); struct lock_class *parent = prt->class; + int src_read = src->read; + int tgt_read = tgt->read; /* * A direct locking problem where unsafe_class lock is taken @@ -1908,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src, printk(" Possible unsafe locking scenario:\n\n"); printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); - printk(" lock("); + if (tgt_read != 0) + printk(" rlock("); + else + printk(" lock("); __print_lock_name(target); printk(KERN_CONT ");\n"); printk(" lock("); @@ -1917,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src, printk(" lock("); __print_lock_name(target); printk(KERN_CONT ");\n"); - printk(" lock("); + if (src_read != 0) + printk(" rlock("); + else if (src->sync) + printk(" sync("); + else + printk(" lock("); __print_lock_name(source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); @@ -4531,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) return 0; } } - if (!hlock->hardirqs_off) { + + /* + * For lock_sync(), don't mark the ENABLED usage, since lock_sync() + * creates no critical section and no extra dependency can be introduced + * by interrupts + */ + if (!hlock->hardirqs_off && !hlock->sync) { if (hlock->read) { if (!mark_lock(curr, hlock, LOCK_ENABLED_HARDIRQ_READ)) @@ -4910,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references, int pin_count) + int references, int pin_count, int sync) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -4961,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { /* we're holding locks */ + if (depth && !sync) { + /* we're holding locks and the new held lock is not a sync */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -4995,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->trylock = trylock; hlock->read = read; hlock->check = check; + hlock->sync = !!sync; hlock->hardirqs_off = !!hardirqs_off; hlock->references = references; #ifdef CONFIG_LOCK_STAT @@ -5056,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; + /* For lock_sync(), we are done here since no actual critical section */ + if (hlock->sync) + return 1; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -5197,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) { + hlock->references, hlock->pin_count, 0)) { case 0: return 1; case 1: @@ -5667,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0); lockdep_recursion_finish(); raw_local_irq_restore(flags); } @@ -5693,6 +5715,34 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); +/* + * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API. + * + * No actual critical section is created by the APIs annotated with this: these + * APIs are used to wait for one or multiple critical sections (on other CPUs + * or threads), and it means that calling these APIs inside these critical + * sections is potential deadlock. + */ +void lock_sync(struct lockdep_map *lock, unsigned subclass, int read, + int check, struct lockdep_map *nest_lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lockdep_enabled())) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + lockdep_recursion_inc(); + __lock_acquire(lock, subclass, 0, read, check, + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1); + check_chain_key(current); + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_sync); + noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f04b1978899d..153ddc4c47ef 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -51,8 +51,11 @@ torture_param(int, rt_boost, 2, torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ +#define MAX_NESTED_LOCKS 8 -static char *torture_type = "spin_lock"; +static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); @@ -79,10 +82,12 @@ static void lock_torture_cleanup(void); struct lock_torture_ops { void (*init)(void); void (*exit)(void); + int (*nested_lock)(int tid, u32 lockset); int (*writelock)(int tid); void (*write_delay)(struct torture_random_state *trsp); void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(int tid); + void (*nested_unlock)(int tid, u32 lockset); int (*readlock)(int tid); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(int tid); @@ -252,6 +257,59 @@ static struct lock_torture_ops spin_lock_irq_ops = { .name = "spin_lock_irq" }; +static DEFINE_RAW_SPINLOCK(torture_raw_spinlock); + +static int torture_raw_spin_lock_write_lock(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + raw_spin_lock(&torture_raw_spinlock); + return 0; +} + +static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock(&torture_raw_spinlock); +} + +static struct lock_torture_ops raw_spin_lock_ops = { + .writelock = torture_raw_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock" +}; + +static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&torture_raw_spinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_spin_lock_irq_ops = { + .writelock = torture_raw_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock_irq" +}; + static DEFINE_RWLOCK(torture_rwlock); static int torture_rwlock_write_lock(int tid __maybe_unused) @@ -365,6 +423,28 @@ static struct lock_torture_ops rw_lock_irq_ops = { }; static DEFINE_MUTEX(torture_mutex); +static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS]; + +static void torture_mutex_init(void) +{ + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __mutex_init(&torture_nested_mutexes[i], __func__, + &nested_mutex_keys[i]); +} + +static int torture_mutex_nested_lock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + mutex_lock(&torture_nested_mutexes[i]); + return 0; +} static int torture_mutex_lock(int tid __maybe_unused) __acquires(torture_mutex) @@ -393,11 +473,24 @@ __releases(torture_mutex) mutex_unlock(&torture_mutex); } +static void torture_mutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + mutex_unlock(&torture_nested_mutexes[i]); +} + static struct lock_torture_ops mutex_lock_ops = { + .init = torture_mutex_init, + .nested_lock = torture_mutex_nested_lock, .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, .task_boost = torture_rt_boost, .writeunlock = torture_mutex_unlock, + .nested_unlock = torture_mutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -504,6 +597,28 @@ static struct lock_torture_ops ww_mutex_lock_ops = { #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); +static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS]; + +static void torture_rtmutex_init(void) +{ + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __rt_mutex_init(&torture_nested_rtmutexes[i], __func__, + &nested_rtmutex_keys[i]); +} + +static int torture_rtmutex_nested_lock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + rt_mutex_lock(&torture_nested_rtmutexes[i]); + return 0; +} static int torture_rtmutex_lock(int tid __maybe_unused) __acquires(torture_rtmutex) @@ -545,11 +660,24 @@ static void torture_rt_boost_rtmutex(struct torture_random_state *trsp) __torture_rt_boost(trsp); } +static void torture_rtmutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + rt_mutex_unlock(&torture_nested_rtmutexes[i]); +} + static struct lock_torture_ops rtmutex_lock_ops = { + .init = torture_rtmutex_init, + .nested_lock = torture_rtmutex_nested_lock, .writelock = torture_rtmutex_lock, .write_delay = torture_rtmutex_delay, .task_boost = torture_rt_boost_rtmutex, .writeunlock = torture_rtmutex_unlock, + .nested_unlock = torture_rtmutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -684,6 +812,8 @@ static int lock_torture_writer(void *arg) struct lock_stress_stats *lwsp = arg; int tid = lwsp - cxt.lwsa; DEFINE_TORTURE_RANDOM(rand); + u32 lockset_mask; + bool skip_main_lock; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); set_user_nice(current, MAX_NICE); @@ -692,19 +822,40 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cxt.cur_ops->task_boost(&rand); - cxt.cur_ops->writelock(tid); - if (WARN_ON_ONCE(lock_is_write_held)) - lwsp->n_lock_fail++; - lock_is_write_held = true; - if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) - lwsp->n_lock_fail++; /* rare, but... */ + lockset_mask = torture_random(&rand); + /* + * When using nested_locks, we want to occasionally + * skip the main lock so we can avoid always serializing + * the lock chains on that central lock. By skipping the + * main lock occasionally, we can create different + * contention patterns (allowing for multiple disjoint + * blocked trees) + */ + skip_main_lock = (nested_locks && + !(torture_random(&rand) % 100)); - lwsp->n_lock_acquired++; + cxt.cur_ops->task_boost(&rand); + if (cxt.cur_ops->nested_lock) + cxt.cur_ops->nested_lock(tid, lockset_mask); + + if (!skip_main_lock) { + cxt.cur_ops->writelock(tid); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_lock_fail++; + lock_is_write_held = true; + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) + lwsp->n_lock_fail++; /* rare, but... */ + + lwsp->n_lock_acquired++; + } cxt.cur_ops->write_delay(&rand); - lock_is_write_held = false; - WRITE_ONCE(last_lock_release, jiffies); - cxt.cur_ops->writeunlock(tid); + if (!skip_main_lock) { + lock_is_write_held = false; + WRITE_ONCE(last_lock_release, jiffies); + cxt.cur_ops->writeunlock(tid); + } + if (cxt.cur_ops->nested_unlock) + cxt.cur_ops->nested_unlock(tid, lockset_mask); stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); @@ -845,11 +996,11 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, - verbose, shuffle_interval, stutter, shutdown_secs, - onoff_interval, onoff_holdoff); + cxt.nrealwriters_stress, cxt.nrealreaders_stress, + nested_locks, stat_interval, verbose, shuffle_interval, + stutter, shutdown_secs, onoff_interval, onoff_holdoff); } static void lock_torture_cleanup(void) @@ -919,6 +1070,7 @@ static int __init lock_torture_init(void) static struct lock_torture_ops *torture_ops[] = { &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &raw_spin_lock_ops, &raw_spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, @@ -1068,6 +1220,10 @@ static int __init lock_torture_init(void) } } + /* cap nested_locks to MAX_NESTED_LOCKS */ + if (nested_locks > MAX_NESTED_LOCKS) + nested_locks = MAX_NESTED_LOCKS; + if (cxt.cur_ops->readlock) { reader_tasks = kcalloc(cxt.nrealreaders_stress, sizeof(reader_tasks[0]), diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 29dc253d03af..93cca6e69860 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -659,7 +659,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a487ff24129b..80d9c6d77a45 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,21 +545,20 @@ static void commit_nsset(struct nsset *nsset) SYSCALL_DEFINE2(setns, int, fd, int, flags) { - struct file *file; + struct fd f = fdget(fd); struct ns_common *ns = NULL; struct nsset nsset = {}; int err = 0; - file = fget(fd); - if (!file) + if (!f.file) return -EBADF; - if (proc_ns_file(file)) { - ns = get_proc_ns(file_inode(file)); + if (proc_ns_file(f.file)) { + ns = get_proc_ns(file_inode(f.file)); if (flags && (ns->ops->type != flags)) err = -EINVAL; flags = ns->ops->type; - } else if (!IS_ERR(pidfd_pid(file))) { + } else if (!IS_ERR(pidfd_pid(f.file))) { err = check_setns_flags(flags); } else { err = -EINVAL; @@ -571,17 +570,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (err) goto out; - if (proc_ns_file(file)) + if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, file->private_data); + err = validate_nsset(&nsset, f.file->private_data); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); } put_nsset(&nsset); out: - fput(file); + fdput(f); return err; } diff --git a/kernel/pid.c b/kernel/pid.c index 3fbc5e46b721..f93954a0384d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) */ int pidfd_create(struct pid *pid, unsigned int flags) { - int fd; - - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) - return -EINVAL; + int pidfd; + struct file *pidfd_file; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - flags | O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); + pidfd = pidfd_prepare(pid, flags, &pidfd_file); + if (pidfd < 0) + return pidfd; - return fd; + fd_install(pidfd, pidfd_file); + return pidfd; } /** diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index ab62074174c3..9071182b1284 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -53,9 +53,6 @@ config RCU_EXPERT Say N if you are unsure. -config SRCU - def_bool y - config TINY_SRCU bool default y if TINY_RCU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 115616ac3bfa..4a1b9622598b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -14,6 +14,43 @@ /* * Grace-period counter management. + * + * The two least significant bits contain the control flags. + * The most significant bits contain the grace-period sequence counter. + * + * When both control flags are zero, no grace period is in progress. + * When either bit is non-zero, a grace period has started and is in + * progress. When the grace period completes, the control flags are reset + * to 0 and the grace-period sequence counter is incremented. + * + * However some specific RCU usages make use of custom values. + * + * SRCU special control values: + * + * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node + * is initialized. + * + * SRCU_STATE_IDLE : No SRCU gp is in progress + * + * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates + * we are scanning the readers on the slot + * defined as inactive (there might well + * be pending readers that will use that + * index, but their number is bounded). + * + * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state() + * Indicates we are flipping the readers + * index and then scanning the readers on the + * slot newly designated as inactive (again, + * the number of pending readers that will use + * this inactive index is bounded). + * + * RCU polled GP special control value: + * + * RCU_GET_STATE_COMPLETED : State value indicating an already-completed + * polled GP has completed. This value covers + * both the state and the counter of the + * grace-period sequence number. */ #define RCU_SEQ_CTR_SHIFT 2 @@ -341,11 +378,13 @@ extern void rcu_init_geometry(void); * specified state structure (for SRCU) or the only rcu_state structure * (for RCU). */ -#define srcu_for_each_node_breadth_first(sp, rnp) \ +#define _rcu_for_each_node_breadth_first(sp, rnp) \ for ((rnp) = &(sp)->node[0]; \ (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++) #define rcu_for_each_node_breadth_first(rnp) \ - srcu_for_each_node_breadth_first(&rcu_state, rnp) + _rcu_for_each_node_breadth_first(&rcu_state, rnp) +#define srcu_for_each_node_breadth_first(ssp, rnp) \ + _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp) /* * Scan the leaves of the rcu_node hierarchy for the rcu_state structure. diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 91fb5905a008..e82ec9f9a5d8 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -631,8 +631,7 @@ static int compute_real(int n) static int rcu_scale_shutdown(void *arg) { - wait_event(shutdown_wq, - atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_scale_cleanup(); kernel_power_off(); @@ -716,7 +715,7 @@ kfree_scale_thread(void *arg) // is tested. if ((kfree_rcu_test_single && !kfree_rcu_test_double) || (kfree_rcu_test_both && torture_random(&tr) & 0x800)) - kfree_rcu(alloc_ptr); + kfree_rcu_mightsleep(alloc_ptr); else kfree_rcu(alloc_ptr, rh); } @@ -771,8 +770,8 @@ kfree_scale_cleanup(void) static int kfree_scale_shutdown(void *arg) { - wait_event(shutdown_wq, - atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); + wait_event_idle(shutdown_wq, + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8e6c023212cb..147551c23baf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -119,7 +119,9 @@ torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); +torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; @@ -179,7 +181,6 @@ static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; @@ -2194,12 +2195,11 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ", atomic_read(&n_rcu_torture_mberror), atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); + n_rcu_torture_boost_ktrerror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, @@ -2217,15 +2217,13 @@ rcu_torture_stats_print(void) if (atomic_read(&n_rcu_torture_mberror) || atomic_read(&n_rcu_torture_mbchk_fail) || n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || - n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || - i > 1) { + n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread - WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) WARN_ON_ONCE(i > 1); // Too-short grace period } @@ -2358,7 +2356,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " - "nocbs_nthreads=%d nocbs_toggle=%d\n", + "nocbs_nthreads=%d nocbs_toggle=%d " + "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2369,7 +2368,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, - nocbs_nthreads, nocbs_toggle); + nocbs_nthreads, nocbs_toggle, + test_nmis); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3273,6 +3273,29 @@ static void rcu_torture_read_exit_cleanup(void) torture_stop_kthread(rcutorture_read_exit, read_exit_task); } +static void rcutorture_test_nmis(int n) +{ +#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + int cpu; + int dumpcpu; + int i; + + for (i = 0; i < n; i++) { + preempt_disable(); + cpu = smp_processor_id(); + dumpcpu = cpu + 1; + if (dumpcpu >= nr_cpu_ids) + dumpcpu = 0; + pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu); + dump_cpu_task(dumpcpu); + preempt_enable(); + schedule_timeout_uninterruptible(15 * HZ); + } +#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis); +#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) +} + static enum cpuhp_state rcutor_hp; static void @@ -3297,6 +3320,8 @@ rcu_torture_cleanup(void) return; } + rcutorture_test_nmis(test_nmis); + if (cur_ops->gp_kthread_dbg) cur_ops->gp_kthread_dbg(); rcu_torture_read_exit_cleanup(); @@ -3463,6 +3488,188 @@ static void rcutorture_sync(void) cur_ops->sync(); } +static DEFINE_MUTEX(mut0); +static DEFINE_MUTEX(mut1); +static DEFINE_MUTEX(mut2); +static DEFINE_MUTEX(mut3); +static DEFINE_MUTEX(mut4); +static DEFINE_MUTEX(mut5); +static DEFINE_MUTEX(mut6); +static DEFINE_MUTEX(mut7); +static DEFINE_MUTEX(mut8); +static DEFINE_MUTEX(mut9); + +static DECLARE_RWSEM(rwsem0); +static DECLARE_RWSEM(rwsem1); +static DECLARE_RWSEM(rwsem2); +static DECLARE_RWSEM(rwsem3); +static DECLARE_RWSEM(rwsem4); +static DECLARE_RWSEM(rwsem5); +static DECLARE_RWSEM(rwsem6); +static DECLARE_RWSEM(rwsem7); +static DECLARE_RWSEM(rwsem8); +static DECLARE_RWSEM(rwsem9); + +DEFINE_STATIC_SRCU(srcu0); +DEFINE_STATIC_SRCU(srcu1); +DEFINE_STATIC_SRCU(srcu2); +DEFINE_STATIC_SRCU(srcu3); +DEFINE_STATIC_SRCU(srcu4); +DEFINE_STATIC_SRCU(srcu5); +DEFINE_STATIC_SRCU(srcu6); +DEFINE_STATIC_SRCU(srcu7); +DEFINE_STATIC_SRCU(srcu8); +DEFINE_STATIC_SRCU(srcu9); + +static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i, + int cyclelen, int deadlock) +{ + int j = i + 1; + + if (j >= cyclelen) + j = deadlock ? 0 : -1; + if (j >= 0) + pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i); + else + pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i); + return j; +} + +// Test lockdep on SRCU-based deadlock scenarios. +static void rcu_torture_init_srcu_lockdep(void) +{ + int cyclelen; + int deadlock; + bool err = false; + int i; + int j; + int idx; + struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4, + &mut5, &mut6, &mut7, &mut8, &mut9 }; + struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4, + &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 }; + struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4, + &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 }; + int testtype; + + if (!test_srcu_lockdep) + return; + + deadlock = test_srcu_lockdep / 1000; + testtype = (test_srcu_lockdep / 10) % 100; + cyclelen = test_srcu_lockdep % 10; + WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus)); + if (WARN_ONCE(deadlock != !!deadlock, + "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n", + __func__, test_srcu_lockdep, deadlock)) + err = true; + if (WARN_ONCE(cyclelen <= 0, + "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n", + __func__, test_srcu_lockdep, cyclelen)) + err = true; + if (err) + goto err_out; + + if (testtype == 0) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu", + "srcu_read_unlock", i, cyclelen, deadlock); + idx = srcu_read_lock(srcus[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + srcu_read_unlock(srcus[i], idx); + } + return; + } + + if (testtype == 1) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + mutex_lock(muts[i]); + mutex_unlock(muts[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu", + "mutex_unlock", i, cyclelen, deadlock); + mutex_lock(muts[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + mutex_unlock(muts[i]); + } + return; + } + + if (testtype == 2) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + down_read(rwsems[i]); + up_read(rwsems[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu", + "up_write", i, cyclelen, deadlock); + down_write(rwsems[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + up_write(rwsems[i]); + } + return; + } + +#ifdef CONFIG_TASKS_TRACE_RCU + if (testtype == 3) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock"; + char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace" + : "synchronize_srcu"; + char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock"; + + j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock); + if (i == 0) + rcu_read_lock_trace(); + else + idx = srcu_read_lock(srcus[i]); + if (j >= 0) { + if (i == cyclelen - 1) + synchronize_rcu_tasks_trace(); + else + synchronize_srcu(srcus[j]); + } + if (i == 0) + rcu_read_unlock_trace(); + else + srcu_read_unlock(srcus[i], idx); + } + return; + } +#endif // #ifdef CONFIG_TASKS_TRACE_RCU + +err_out: + pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep); + pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__); + pr_info("%s: D: Deadlock if nonzero.\n", __func__); + pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__); + pr_info("%s: L: Cycle length.\n", __func__); + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU)) + pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__); +} + static int __init rcu_torture_init(void) { @@ -3501,9 +3708,17 @@ rcu_torture_init(void) pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } + if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops || + !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n", + cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU)); + nocbs_nthreads = 0; + } if (cur_ops->init) cur_ops->init(); + rcu_torture_init_srcu_lockdep(); + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -3540,7 +3755,6 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index afa3e1a2f690..1970ce5f22d4 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -1031,7 +1031,7 @@ ref_scale_cleanup(void) static int ref_scale_shutdown(void *arg) { - wait_event(shutdown_wq, shutdown_start); + wait_event_idle(shutdown_wq, shutdown_start); smp_mb(); // Wake before output. ref_scale_cleanup(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b12fb0cec44d..336af24e0fe3 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -197,6 +197,8 @@ void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ab4ee58af84b..20d7a238d675 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -103,7 +103,7 @@ do { \ #define spin_trylock_irqsave_rcu_node(p, flags) \ ({ \ - bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ \ if (___locked) \ smp_mb__after_unlock_lock(); \ @@ -135,8 +135,8 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -173,14 +173,14 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); - ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); - if (!ssp->node) + ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags); + if (!ssp->srcu_sup->node) return false; /* Work out the overall tree geometry. */ - ssp->level[0] = &ssp->node[0]; + ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0]; for (i = 1; i < rcu_num_lvls; i++) - ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ @@ -195,17 +195,17 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; - if (snp == &ssp->node[0]) { + if (snp == &ssp->srcu_sup->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; } /* Non-root node. */ - if (snp == ssp->level[level + 1]) + if (snp == ssp->srcu_sup->level[level + 1]) level++; - snp->srcu_parent = ssp->level[level - 1] + - (snp - ssp->level[level]) / + snp->srcu_parent = ssp->srcu_sup->level[level - 1] + + (snp - ssp->srcu_sup->level[level]) / levelspread[level - 1]; } @@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) * leaves of the srcu_node tree. */ level = rcu_num_lvls - 1; - snp_first = ssp->level[level]; + snp_first = ssp->srcu_sup->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); sdp->mynode = &snp_first[cpu / levelspread[level]]; @@ -225,7 +225,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) } sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; } @@ -236,36 +236,47 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { - ssp->srcu_size_state = SRCU_SIZE_SMALL; - ssp->node = NULL; - mutex_init(&ssp->srcu_cb_mutex); - mutex_init(&ssp->srcu_gp_mutex); + if (!is_static) + ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); + if (!ssp->srcu_sup) + return -ENOMEM; + if (!is_static) + spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; + ssp->srcu_sup->node = NULL; + mutex_init(&ssp->srcu_sup->srcu_cb_mutex); + mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; - ssp->srcu_gp_seq = 0; - ssp->srcu_barrier_seq = 0; - mutex_init(&ssp->srcu_barrier_mutex); - atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&ssp->work, process_srcu); - ssp->sda_is_static = is_static; + ssp->srcu_sup->srcu_gp_seq = 0; + ssp->srcu_sup->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); + ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); - if (!ssp->sda) + if (!ssp->sda) { + if (!is_static) + kfree(ssp->srcu_sup); return -ENOMEM; + } init_srcu_struct_data(ssp); - ssp->srcu_gp_seq_needed_exp = 0; - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; + ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->sda_is_static) { + if (!ssp->srcu_sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; + kfree(ssp->srcu_sup); return -ENOMEM; } } else { - WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } } - smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ + ssp->srcu_sup->srcu_ssp = ssp; + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -277,7 +288,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -294,7 +304,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -306,8 +315,8 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); */ static void __srcu_transition_to_big(struct srcu_struct *ssp) { - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); } /* @@ -318,15 +327,15 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) unsigned long flags; /* Double-checked locking on ->srcu_size-state. */ - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; - spin_lock_irqsave_rcu_node(ssp, flags); - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } __srcu_transition_to_big(ssp); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -337,14 +346,14 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; - if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state) return; j = jiffies; - if (ssp->srcu_size_jiffies != j) { - ssp->srcu_size_jiffies = j; - ssp->srcu_n_lock_retries = 0; + if (ssp->srcu_sup->srcu_size_jiffies != j) { + ssp->srcu_sup->srcu_size_jiffies = j; + ssp->srcu_sup->srcu_n_lock_retries = 0; } - if (++ssp->srcu_n_lock_retries <= small_contention_lim) + if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim) return; __srcu_transition_to_big(ssp); } @@ -361,9 +370,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon if (spin_trylock_irqsave_rcu_node(sdp, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); - spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_rcu_node(sdp, *flags); } @@ -375,9 +384,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon */ static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) { - if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); } @@ -394,15 +403,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). */ - if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp, flags); - if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -607,17 +616,18 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long gpstart; unsigned long j; unsigned long jbase = SRCU_INTERVAL; + struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; - gpstart = READ_ONCE(ssp->srcu_gp_start); + gpstart = READ_ONCE(sup->srcu_gp_start); if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { - WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; } } @@ -634,12 +644,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + struct srcu_usage *sup = ssp->srcu_sup; if (WARN_ON(!srcu_get_delay(ssp))) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - flush_delayed_work(&ssp->work); + flush_delayed_work(&sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); @@ -648,21 +659,23 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ } - if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || + if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), - rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); + __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), + rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - if (!ssp->sda_is_static) { + kfree(sup->node); + sup->node = NULL; + sup->srcu_size_state = SRCU_SIZE_SMALL; + if (!sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; + kfree(sup); + ssp->srcu_sup = NULL; } - kfree(ssp->node); - ssp->node = NULL; - ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -760,23 +773,23 @@ static void srcu_gp_start(struct srcu_struct *ssp) struct srcu_data *sdp; int state; - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = this_cpu_ptr(ssp->sda); - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ - WRITE_ONCE(ssp->srcu_gp_start, jiffies); - WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); + WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(ssp->srcu_gp_seq); + rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq); + state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -849,28 +862,29 @@ static void srcu_gp_end(struct srcu_struct *ssp) unsigned long sgsne; struct srcu_node *snp; int ss_state; + struct srcu_usage *sup = ssp->srcu_sup; /* Prevent more than one additional grace period. */ - mutex_lock(&ssp->srcu_cb_mutex); + mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(ssp); - idx = rcu_seq_state(ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) cbdelay = 0; - WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); - rcu_seq_end(&ssp->srcu_gp_seq); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); + rcu_seq_end(&sup->srcu_gp_seq); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); + spin_unlock_irq_rcu_node(sup); + mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_BARRIER) { srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), cbdelay); @@ -879,7 +893,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + last_lvl = snp >= sup->level[rcu_num_lvls - 1]; if (last_lvl) cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; @@ -911,18 +925,18 @@ static void srcu_gp_end(struct srcu_struct *ssp) } /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&ssp->srcu_cb_mutex); + mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(ssp); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); } /* Transition to big if needed. */ @@ -930,7 +944,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (ss_state == SRCU_SIZE_ALLOC) init_srcu_struct_nodes(ssp, GFP_KERNEL); else - smp_store_release(&ssp->srcu_size_state, ss_state + 1); + smp_store_release(&sup->srcu_size_state, ss_state + 1); } } @@ -950,7 +964,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (snp) for (; snp != NULL; snp = snp->srcu_parent) { sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); - if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) || + if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -963,9 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -990,9 +1004,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, struct srcu_node *snp; struct srcu_node *snp_leaf; unsigned long snp_seq; + struct srcu_usage *sup = ssp->srcu_sup; /* Ensure that snp node tree is fully initialized before traversing it */ - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) snp_leaf = NULL; else snp_leaf = sdp->mynode; @@ -1000,7 +1015,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf) + if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; @@ -1027,20 +1042,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* Top of tree, must ensure the grace period will be started. */ spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. */ - smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); + if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s); /* If grace period not already in progress, start it. */ - if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && - rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && + rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed)); srcu_gp_start(ssp); // And how can that list_add() in the "else" clause @@ -1049,12 +1064,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &ssp->work, + queue_delayed_work(rcu_gp_wq, &sup->work, !!srcu_get_delay(ssp)); - else if (list_empty(&ssp->work.work.entry)) - list_add(&ssp->work.work.entry, &srcu_boot_list); + else if (list_empty(&sup->work.work.entry)) + list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(sup, flags); } /* @@ -1085,16 +1100,36 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) static void srcu_flip(struct srcu_struct *ssp) { /* - * Ensure that if this updater saw a given reader's increment - * from __srcu_read_lock(), that reader was using an old value - * of ->srcu_idx. Also ensure that if a given reader sees the - * new value of ->srcu_idx, this updater's earlier scans cannot - * have seen that reader's increments (which is OK, because this - * grace period need not wait on that reader). + * Because the flip of ->srcu_idx is executed only if the + * preceding call to srcu_readers_active_idx_check() found that + * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched + * and because that summing uses atomic_long_read(), there is + * ordering due to a control dependency between that summing and + * the WRITE_ONCE() in this call to srcu_flip(). This ordering + * ensures that if this updater saw a given reader's increment from + * __srcu_read_lock(), that reader was using a value of ->srcu_idx + * from before the previous call to srcu_flip(), which should be + * quite rare. This ordering thus helps forward progress because + * the grace period could otherwise be delayed by additional + * calls to __srcu_read_lock() using that old (soon to be new) + * value of ->srcu_idx. + * + * This sum-equality check and ordering also ensures that if + * a given call to __srcu_read_lock() uses the new value of + * ->srcu_idx, this updater's earlier scans cannot have seen + * that reader's increments, which is all to the good, because + * this grace period need not wait on that reader. After all, + * if those earlier scans had seen that reader, there would have + * been a sum mismatch and this code would not be reached. + * + * This means that the following smp_mb() is redundant, but + * it stays until either (1) Compilers learn about this sort of + * control dependency or (2) Some production workload running on + * a production system is unduly delayed by this slowpath smp_mb(). */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -1154,18 +1189,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); - tlast = READ_ONCE(ssp->srcu_last_gp_end); + tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end); if (exp_holdoff == 0 || time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&ssp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ - if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -1199,7 +1234,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, * sequence number cannot wrap around in the meantime. */ idx = __srcu_read_lock_nmisafe(ssp); - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_CALL) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else @@ -1208,8 +1243,8 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; @@ -1307,6 +1342,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || @@ -1420,7 +1457,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) // Any prior manipulation of SRCU-protected data must happen // before the load from ->srcu_gp_seq. smp_mb(); - return rcu_seq_snap(&ssp->srcu_gp_seq); + return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); @@ -1467,7 +1504,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) + if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. @@ -1486,8 +1523,8 @@ static void srcu_barrier_cb(struct rcu_head *rhp) sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); } /* @@ -1501,13 +1538,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp) static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) { spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); + atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); + atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); } spin_unlock_irq_rcu_node(sdp); } @@ -1520,23 +1557,23 @@ void srcu_barrier(struct srcu_struct *ssp) { int cpu; int idx; - unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); + unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq); check_init_srcu_struct(ssp); - mutex_lock(&ssp->srcu_barrier_mutex); - if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { + mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&ssp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&ssp->srcu_barrier_seq); - init_completion(&ssp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq); + init_completion(&ssp->srcu_sup->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. */ - atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1); idx = __srcu_read_lock_nmisafe(ssp); - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id())); else for_each_possible_cpu(cpu) @@ -1544,12 +1581,12 @@ void srcu_barrier(struct srcu_struct *ssp) __srcu_read_unlock_nmisafe(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); - wait_for_completion(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion); - rcu_seq_end(&ssp->srcu_barrier_seq); - mutex_unlock(&ssp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); @@ -1575,7 +1612,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&ssp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_sup->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after @@ -1587,39 +1624,39 @@ static void srcu_advance_state(struct srcu_struct *ssp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (ssp->srcu_idx & 1); if (!try_check_zero(ssp, idx, 1)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } srcu_flip(ssp); - spin_lock_irq_rcu_node(ssp); - rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); - ssp->srcu_n_exp_nodelay = 0; - spin_unlock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); + rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_sup->srcu_n_exp_nodelay = 0; + spin_unlock_irq_rcu_node(ssp->srcu_sup); } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, @@ -1627,10 +1664,10 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ idx = 1 ^ (ssp->srcu_idx & 1); if (!try_check_zero(ssp, idx, 2)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } - ssp->srcu_n_exp_nodelay = 0; + ssp->srcu_sup->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1656,7 +1693,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1684,7 +1721,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); @@ -1700,20 +1737,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) - queue_delayed_work(rcu_gp_wq, &ssp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); } /* @@ -1724,22 +1761,24 @@ static void process_srcu(struct work_struct *work) unsigned long curdelay; unsigned long j; struct srcu_struct *ssp; + struct srcu_usage *sup; - ssp = container_of(work, struct srcu_struct, work.work); + sup = container_of(work, struct srcu_usage, work.work); + ssp = sup->srcu_ssp; srcu_advance_state(ssp); curdelay = srcu_get_delay(ssp); if (curdelay) { - WRITE_ONCE(ssp->reschedule_count, 0); + WRITE_ONCE(sup->reschedule_count, 0); } else { j = jiffies; - if (READ_ONCE(ssp->reschedule_jiffies) == j) { - WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); - if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) + if (READ_ONCE(sup->reschedule_jiffies) == j) { + WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); + if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; } else { - WRITE_ONCE(ssp->reschedule_count, 1); - WRITE_ONCE(ssp->reschedule_jiffies, j); + WRITE_ONCE(sup->reschedule_count, 1); + WRITE_ONCE(sup->reschedule_jiffies, j); } } srcu_reschedule(ssp, curdelay); @@ -1752,7 +1791,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, if (test_type != SRCU_FLAVOR) return; *flags = 0; - *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); @@ -1774,14 +1813,14 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int cpu; int idx; unsigned long s0 = 0, s1 = 0; - int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state, srcu_size_state_name[ss_state_idx]); if (!ssp->sda) { // Called after cleanup_srcu_struct(), perhaps. @@ -1838,7 +1877,7 @@ early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *ssp; + struct srcu_usage *sup; /* Decide on srcu_struct-size strategy. */ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { @@ -1858,12 +1897,13 @@ void __init srcu_init(void) */ srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, + sup = list_first_entry(&srcu_boot_list, struct srcu_usage, work.work.entry); - list_del_init(&ssp->work.work.entry); - if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) - ssp->srcu_size_state = SRCU_SIZE_ALLOC; - queue_work(rcu_gp_wq, &ssp->work.work); + list_del_init(&sup->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && + sup->srcu_size_state == SRCU_SIZE_SMALL) + sup->srcu_size_state = SRCU_SIZE_ALLOC; + queue_work(rcu_gp_wq, &sup->work.work); } } @@ -1873,13 +1913,14 @@ void __init srcu_init(void) static int srcu_module_coming(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - int ret; for (i = 0; i < mod->num_srcu_structs; i++) { - ret = init_srcu_struct(*(sspp++)); - if (WARN_ON_ONCE(ret)) - return ret; + ssp = *(sspp++); + ssp->sda = alloc_percpu(struct srcu_data); + if (WARN_ON_ONCE(!ssp->sda)) + return -ENOMEM; } return 0; } @@ -1888,10 +1929,17 @@ static int srcu_module_coming(struct module *mod) static void srcu_module_going(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - for (i = 0; i < mod->num_srcu_structs; i++) - cleanup_srcu_struct(*(sspp++)); + for (i = 0; i < mod->num_srcu_structs; i++) { + ssp = *(sspp++); + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) && + !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static)) + cleanup_srcu_struct(ssp); + if (!WARN_ON(srcu_readers_active(ssp))) + free_percpu(ssp->sda); + } } /* Handle one module, either coming or going. */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bfb5e1549f2b..5f4fc8184dd0 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -136,8 +136,16 @@ static struct rcu_tasks rt_name = \ .kname = #rt_name, \ } +#ifdef CONFIG_TASKS_RCU /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); +#endif + +#ifdef CONFIG_TASKS_RCU +/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ +static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); +static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); +#endif /* Avoid IPIing CPUs early in the grace period. */ #define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) @@ -830,6 +838,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) /* Processing between scanning taskslist and draining the holdout list. */ static void rcu_tasks_postscan(struct list_head *hop) { + int rtsi = READ_ONCE(rcu_task_stall_info); + + if (!IS_ENABLED(CONFIG_TINY_RCU)) { + tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi; + add_timer(&tasks_rcu_exit_srcu_stall_timer); + } + /* * Exiting tasks may escape the tasklist scan. Those are vulnerable * until their final schedule() with TASK_DEAD state. To cope with @@ -848,6 +863,9 @@ static void rcu_tasks_postscan(struct list_head *hop) * call to synchronize_rcu(). */ synchronize_srcu(&tasks_rcu_exit_srcu); + + if (!IS_ENABLED(CONFIG_TINY_RCU)) + del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); } /* See if tasks are still holding out, complain if so. */ @@ -923,6 +941,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); +static void tasks_rcu_exit_srcu_stall(struct timer_list *unused) +{ +#ifndef CONFIG_TINY_RCU + int rtsi; + + rtsi = READ_ONCE(rcu_task_stall_info); + pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n", + __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq, + tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies); + pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n"); + tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi; + add_timer(&tasks_rcu_exit_srcu_stall_timer); +#endif // #ifndef CONFIG_TINY_RCU +} + /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @rhp: structure to be used for queueing the RCU updates. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e880c09ab59..f52ff7241041 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -640,6 +640,7 @@ void __rcu_irq_enter_check_tick(void) } raw_spin_unlock_rcu_node(rdp->mynode); } +NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick); #endif /* CONFIG_NO_HZ_FULL */ /* @@ -1955,7 +1956,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake = false; bool needacc = false; struct rcu_node *rnp; @@ -1987,7 +1987,12 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * NOCB kthreads have their own way to deal with that... */ if (!rcu_rdp_is_offloaded(rdp)) { - needwake = rcu_accelerate_cbs(rnp, rdp); + /* + * The current GP has not yet ended, so it + * should not be possible for rcu_accelerate_cbs() + * to return true. So complain, but don't awaken. + */ + WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { /* * ...but NOCB kthreads may miss or delay callbacks acceleration @@ -1999,8 +2004,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - if (needwake) - rcu_gp_kthread_wake(); if (needacc) { rcu_nocb_lock_irqsave(rdp, flags); @@ -2131,6 +2134,8 @@ static void rcu_do_batch(struct rcu_data *rdp) break; } } else { + // In rcuoc context, so no worries about depriving + // other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); @@ -3024,6 +3029,18 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) return !!READ_ONCE(krcp->head); } +static bool +need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krwp->bulk_head_free[i])) + return true; + + return !!krwp->head_free; +} + static int krc_count(struct kfree_rcu_cpu *krcp) { int sum = atomic_read(&krcp->head_count); @@ -3107,15 +3124,14 @@ static void kfree_rcu_monitor(struct work_struct *work) for (i = 0; i < KFREE_N_BATCHES; i++) { struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - // Try to detach bulk_head or head and attach it over any - // available corresponding free channel. It can be that - // a previous RCU batch is in progress, it means that - // immediately to queue another one is not possible so - // in that case the monitor work is rearmed. - if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) || - (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) || - (READ_ONCE(krcp->head) && !krwp->head_free)) { + // Try to detach bulk_head or head and attach it, only when + // all channels are free. Any channel is not free means at krwp + // there is on-going rcu work to handle krwp's free business. + if (need_wait_for_krwp_work(krwp)) + continue; + // kvfree_rcu_drain_ready() might handle this krcp, if so give up. + if (need_offload_krc(krcp)) { // Channel 1 corresponds to the SLAB-pointer bulk path. // Channel 2 corresponds to vmalloc-pointer bulk path. for (j = 0; j < FREE_N_CHANNELS; j++) { @@ -4940,9 +4956,8 @@ void __init rcu_init(void) else qovld_calc = qovld; - // Kick-start any polled grace periods that started early. - if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) - (void)start_poll_synchronize_rcu_expedited(); + // Kick-start in case any polled grace periods started early. + (void)start_poll_synchronize_rcu_expedited(); rcu_test_sync_prims(); } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 249c2967d9e6..3b7abb58157d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -594,6 +594,7 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); + unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_exp_jiffies_till_stall_check(); @@ -602,17 +603,17 @@ static void synchronize_rcu_expedited_wait(void) if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); mask = READ_ONCE(rnp->expmask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->rcu_forced_tick_exp) continue; rdp->rcu_forced_tick_exp = true; - preempt_disable(); if (cpu_online(cpu)) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); - preempt_enable(); } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } j = READ_ONCE(jiffies_till_first_fqs); if (synchronize_rcu_expedited_wait_once(j + HZ)) @@ -802,9 +803,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) int ndetected = 0; struct task_struct *t; - if (!READ_ONCE(rnp->exp_tasks)) - return 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rnp->exp_tasks) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return 0; + } t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { @@ -1065,9 +1068,10 @@ unsigned long start_poll_synchronize_rcu_expedited(void) if (rcu_init_invoked()) raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); if (!poll_state_synchronize_rcu(s)) { - rnp->exp_seq_poll_rq = s; - if (rcu_init_invoked()) + if (rcu_init_invoked()) { + rnp->exp_seq_poll_rq = s; queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } } if (rcu_init_invoked()) raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 9e1c8caec5ce..f2280616f9d5 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1312,6 +1312,7 @@ int rcu_nocb_cpu_offload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); +#ifdef CONFIG_RCU_LAZY static unsigned long lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -1360,6 +1361,7 @@ static struct shrinker lazy_rcu_shrinker = { .batch = 0, .seeks = DEFAULT_SEEKS, }; +#endif // #ifdef CONFIG_RCU_LAZY void __init rcu_init_nohz(void) { @@ -1391,8 +1393,10 @@ void __init rcu_init_nohz(void) if (!rcu_state.nocb_is_setup) return; +#ifdef CONFIG_RCU_LAZY if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) pr_err("Failed to register lazy_rcu shrinker!\n"); +#endif // #ifdef CONFIG_RCU_LAZY if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6986ea31c984..5f6587d94c1d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10238,6 +10238,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / sds->total_capacity; + + /* + * If the local group is more loaded than the average system + * load, don't try to pull any tasks. + */ + if (local->avg_load >= sds->avg_load) { + env->imbalance = 0; + return; + } + } /* diff --git a/kernel/sys.c b/kernel/sys.c index 495cd87d9bf4..351de7916302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) struct cred *new; int retval; kuid_t kruid, keuid, ksuid; + bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); @@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && + (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && + uid_eq(keuid, old->fsuid))) && + (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) + return 0; + + ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); + euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); + suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); + if ((ruid_new || euid_new || suid_new) && + !ns_capable_setid(old->user_ns, CAP_SETUID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { - if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && - !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) - goto error; - if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && - !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) - goto error; - if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && - !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) - goto error; - } - if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { @@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) struct cred *new; int retval; kgid_t krgid, kegid, ksgid; + bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); @@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && + (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && + gid_eq(kegid, old->fsgid))) && + (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) + return 0; + + rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); + egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); + sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); + if ((rgid_new || egid_new || sgid_new) && + !ns_capable_setid(old->user_ns, CAP_SETGID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { - if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && - !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) - goto error; - if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && - !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) - goto error; - if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && - !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) - goto error; - } if (rgid != (gid_t) -1) new->gid = krgid; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b0e3c9205946..a46506f7ec6d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -281,6 +281,11 @@ static bool check_tick_dependency(atomic_t *dep) return true; } + if (val & TICK_DEP_MASK_RCU_EXP) { + trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); + return true; + } + return false; } @@ -527,7 +532,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) tick_nohz_full_running = true; } -static int tick_nohz_cpu_down(unsigned int cpu) +bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The tick_do_timer_cpu CPU handles housekeeping duty (unbound @@ -535,8 +540,13 @@ static int tick_nohz_cpu_down(unsigned int cpu) * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return -EBUSY; - return 0; + return false; + return true; +} + +static int tick_nohz_cpu_down(unsigned int cpu) +{ + return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4496975f2029..efbbec2caff8 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -159,7 +159,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - kvfree_rcu(inst); + kvfree_rcu_mightsleep(inst); } /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 20d0c4a97633..2d2616678295 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1172,7 +1172,7 @@ int trace_probe_remove_file(struct trace_probe *tp, return -ENOENT; list_del_rcu(&link->list); - kvfree_rcu(link); + kvfree_rcu_mightsleep(link); if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c new file mode 100644 index 000000000000..b7cbd66f889e --- /dev/null +++ b/kernel/vhost_task.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Oracle Corporation + */ +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/sched/task.h> +#include <linux/sched/vhost_task.h> +#include <linux/sched/signal.h> + +enum vhost_task_flags { + VHOST_TASK_FLAGS_STOP, +}; + +static int vhost_task_fn(void *data) +{ + struct vhost_task *vtsk = data; + int ret; + + ret = vtsk->fn(vtsk->data); + complete(&vtsk->exited); + do_exit(ret); +} + +/** + * vhost_task_stop - stop a vhost_task + * @vtsk: vhost_task to stop + * + * Callers must call vhost_task_should_stop and return from their worker + * function when it returns true; + */ +void vhost_task_stop(struct vhost_task *vtsk) +{ + pid_t pid = vtsk->task->pid; + + set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); + wake_up_process(vtsk->task); + /* + * Make sure vhost_task_fn is no longer accessing the vhost_task before + * freeing it below. If userspace crashed or exited without closing, + * then the vhost_task->task could already be marked dead so + * kernel_wait will return early. + */ + wait_for_completion(&vtsk->exited); + /* + * If we are just closing/removing a device and the parent process is + * not exiting then reap the task. + */ + kernel_wait4(pid, NULL, __WCLONE, NULL); + kfree(vtsk); +} +EXPORT_SYMBOL_GPL(vhost_task_stop); + +/** + * vhost_task_should_stop - should the vhost task return from the work function + * @vtsk: vhost_task to stop + */ +bool vhost_task_should_stop(struct vhost_task *vtsk) +{ + return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); +} +EXPORT_SYMBOL_GPL(vhost_task_should_stop); + +/** + * vhost_task_create - create a copy of a process to be used by the kernel + * @fn: thread stack + * @arg: data to be passed to fn + * @name: the thread's name + * + * This returns a specialized task for use by the vhost layer or NULL on + * failure. The returned task is inactive, and the caller must fire it up + * through vhost_task_start(). + */ +struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, + const char *name) +{ + struct kernel_clone_args args = { + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, + .exit_signal = 0, + .fn = vhost_task_fn, + .name = name, + .user_worker = 1, + .no_files = 1, + .ignore_signals = 1, + }; + struct vhost_task *vtsk; + struct task_struct *tsk; + + vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); + if (!vtsk) + return NULL; + init_completion(&vtsk->exited); + vtsk->data = arg; + vtsk->fn = fn; + + args.fn_arg = vtsk; + + tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); + if (IS_ERR(tsk)) { + kfree(vtsk); + return NULL; + } + + vtsk->task = tsk; + return vtsk; +} +EXPORT_SYMBOL_GPL(vhost_task_create); + +/** + * vhost_task_start - start a vhost_task created with vhost_task_create + * @vtsk: vhost_task to wake up + */ +void vhost_task_start(struct vhost_task *vtsk) +{ + wake_up_new_task(vtsk->task); +} +EXPORT_SYMBOL_GPL(vhost_task_start); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index df86e649d8be..b796799fadb2 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -216,10 +216,6 @@ static struct debug_obj *__alloc_object(struct hlist_head *list) return obj; } -/* - * Allocate a new object. If the pool is empty, switch off the debugger. - * Must be called with interrupts disabled. - */ static struct debug_obj * alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) { @@ -552,11 +548,49 @@ static void debug_object_is_on_stack(void *addr, int onstack) WARN_ON(1); } +static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket *b, + const struct debug_obj_descr *descr, + bool onstack, bool alloc_ifstatic) +{ + struct debug_obj *obj = lookup_object(addr, b); + enum debug_obj_state state = ODEBUG_STATE_NONE; + + if (likely(obj)) + return obj; + + /* + * debug_object_init() unconditionally allocates untracked + * objects. It does not matter whether it is a static object or + * not. + * + * debug_object_assert_init() and debug_object_activate() allow + * allocation only if the descriptor callback confirms that the + * object is static and considered initialized. For non-static + * objects the allocation needs to be done from the fixup callback. + */ + if (unlikely(alloc_ifstatic)) { + if (!descr->is_static_object || !descr->is_static_object(addr)) + return ERR_PTR(-ENOENT); + /* Statically allocated objects are considered initialized */ + state = ODEBUG_STATE_INIT; + } + + obj = alloc_object(addr, b, descr); + if (likely(obj)) { + obj->state = state; + debug_object_is_on_stack(addr, onstack); + return obj; + } + + /* Out of memory. Do the cleanup outside of the locked region */ + debug_objects_enabled = 0; + return NULL; +} + static void __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack) { enum debug_obj_state state; - bool check_stack = false; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; @@ -572,16 +606,11 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack raw_spin_lock_irqsave(&db->lock, flags); - obj = lookup_object(addr, db); - if (!obj) { - obj = alloc_object(addr, db, descr); - if (!obj) { - debug_objects_enabled = 0; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_objects_oom(); - return; - } - check_stack = true; + obj = lookup_object_or_alloc(addr, db, descr, onstack, false); + if (unlikely(!obj)) { + raw_spin_unlock_irqrestore(&db->lock, flags); + debug_objects_oom(); + return; } switch (obj->state) { @@ -607,8 +636,6 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack } raw_spin_unlock_irqrestore(&db->lock, flags); - if (check_stack) - debug_object_is_on_stack(addr, onstack); } /** @@ -648,14 +675,12 @@ EXPORT_SYMBOL_GPL(debug_object_init_on_stack); */ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; enum debug_obj_state state; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; int ret; - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; if (!debug_objects_enabled) return 0; @@ -664,8 +689,8 @@ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags); - obj = lookup_object(addr, db); - if (obj) { + obj = lookup_object_or_alloc(addr, db, descr, false, true); + if (likely(!IS_ERR_OR_NULL(obj))) { bool print_object = false; switch (obj->state) { @@ -698,24 +723,16 @@ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); - /* - * We are here when a static object is activated. We - * let the type specific code confirm whether this is - * true or not. if true, we just make sure that the - * static object is tracked in the object tracker. If - * not, this must be a bug, so we try to fix it up. - */ - if (descr->is_static_object && descr->is_static_object(addr)) { - /* track this static object */ - debug_object_init(addr, descr); - debug_object_activate(addr, descr); - } else { - debug_print_object(&o, "activate"); - ret = debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE); - return ret ? 0 : -EINVAL; + /* If NULL the allocation has hit OOM */ + if (!obj) { + debug_objects_oom(); + return 0; } - return 0; + + /* Object is neither static nor tracked. It's not initialized */ + debug_print_object(&o, "activate"); + ret = debug_object_fixup(descr->fixup_activate, addr, ODEBUG_STATE_NOTAVAILABLE); + return ret ? 0 : -EINVAL; } EXPORT_SYMBOL_GPL(debug_object_activate); @@ -869,6 +886,7 @@ EXPORT_SYMBOL_GPL(debug_object_free); */ void debug_object_assert_init(void *addr, const struct debug_obj_descr *descr) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; @@ -879,31 +897,20 @@ void debug_object_assert_init(void *addr, const struct debug_obj_descr *descr) db = get_bucket((unsigned long) addr); raw_spin_lock_irqsave(&db->lock, flags); + obj = lookup_object_or_alloc(addr, db, descr, false, true); + raw_spin_unlock_irqrestore(&db->lock, flags); + if (likely(!IS_ERR_OR_NULL(obj))) + return; - obj = lookup_object(addr, db); + /* If NULL the allocation has hit OOM */ if (!obj) { - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; - - raw_spin_unlock_irqrestore(&db->lock, flags); - /* - * Maybe the object is static, and we let the type specific - * code confirm. Track this static object if true, else invoke - * fixup. - */ - if (descr->is_static_object && descr->is_static_object(addr)) { - /* Track this static object */ - debug_object_init(addr, descr); - } else { - debug_print_object(&o, "assert_init"); - debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE); - } + debug_objects_oom(); return; } - raw_spin_unlock_irqrestore(&db->lock, flags); + /* Object is neither tracked nor static. It's not initialized. */ + debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, ODEBUG_STATE_NOTAVAILABLE); } EXPORT_SYMBOL_GPL(debug_object_assert_init); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 274014e4eafe..967fba189c5f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -126,13 +126,13 @@ __out: \ iterate_buf(i, n, base, len, off, \ i->ubuf, (I)) \ } else if (likely(iter_is_iovec(i))) { \ - const struct iovec *iov = i->iov; \ + const struct iovec *iov = iter_iov(i); \ void __user *base; \ size_t len; \ iterate_iovec(i, n, base, len, off, \ iov, (I)) \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ + i->nr_segs -= iov - iter_iov(i); \ + i->__iov = iov; \ } else if (iov_iter_is_bvec(i)) { \ const struct bio_vec *bvec = i->bvec; \ void *base; \ @@ -355,7 +355,7 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; @@ -398,7 +398,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; @@ -425,7 +425,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, .nofault = false, .user_backed = true, .data_source = direction, - .iov = iov, + .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count @@ -876,14 +876,14 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) i->count -= size; size += i->iov_offset; // from beginning of current segment - for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { + for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; - i->nr_segs -= iov - i->iov; - i->iov = iov; + i->nr_segs -= iov - iter_iov(i); + i->__iov = iov; } void iov_iter_advance(struct iov_iter *i, size_t size) @@ -958,12 +958,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) unroll -= n; } } else { /* same logics for iovec and kvec */ - const struct iovec *iov = i->iov; + const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { - i->iov = iov; + i->__iov = iov; i->iov_offset = n - unroll; return; } @@ -980,7 +980,7 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return min(i->count, i->iov->iov_len - i->iov_offset); + return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } @@ -1095,13 +1095,14 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask) + if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; size -= len; @@ -1194,9 +1195,10 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len) { - res |= (unsigned long)i->iov[k].iov_base + skip; + res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; @@ -1273,14 +1275,15 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) return ~0U; for (k = 0; k < i->nr_segs; k++) { - if (i->iov[k].iov_len) { - unsigned long base = (unsigned long)i->iov[k].iov_base; + const struct iovec *iov = iter_iov(i) + k; + if (iov->iov_len) { + unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end - v = base + i->iov[k].iov_len; - if (size <= i->iov[k].iov_len) + v = base + iov->iov_len; + if (size <= iov->iov_len) break; - size -= i->iov[k].iov_len; + size -= iov->iov_len; } } return res; @@ -1396,13 +1399,14 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; - return (unsigned long)i->iov[k].iov_base + skip; + return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } @@ -1614,7 +1618,7 @@ static int iov_npages(const struct iov_iter *i, int maxpages) const struct iovec *p; int npages = 0; - for (p = i->iov; size; skip = 0, p++) { + for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); @@ -1691,14 +1695,14 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ - return new->iov = kmemdup(new->iov, + return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); -static int copy_compat_iovec_from_user(struct iovec *iov, +static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, unsigned long nr_segs) { const struct compat_iovec __user *uiov = @@ -1731,18 +1735,35 @@ uaccess_end: } static int copy_iovec_from_user(struct iovec *iov, - const struct iovec __user *uvec, unsigned long nr_segs) + const struct iovec __user *uiov, unsigned long nr_segs) { - unsigned long seg; + int ret = -EFAULT; - if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec))) + if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; - for (seg = 0; seg < nr_segs; seg++) { - if ((ssize_t)iov[seg].iov_len < 0) - return -EINVAL; - } - return 0; + do { + void __user *buf; + ssize_t len; + + unsafe_get_user(len, &uiov->iov_len, uaccess_end); + unsafe_get_user(buf, &uiov->iov_base, uaccess_end); + + /* check for size_t not fitting in ssize_t .. */ + if (unlikely(len < 0)) { + ret = -EINVAL; + goto uaccess_end; + } + iov->iov_base = buf; + iov->iov_len = len; + + uiov++; iov++; + } while (--nr_segs); + + ret = 0; +uaccess_end: + user_access_end(); + return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, @@ -1767,7 +1788,7 @@ struct iovec *iovec_from_user(const struct iovec __user *uvec, return ERR_PTR(-ENOMEM); } - if (compat) + if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); @@ -1780,6 +1801,30 @@ struct iovec *iovec_from_user(const struct iovec __user *uvec, return iov; } +/* + * Single segment iovec supplied by the user, import it as ITER_UBUF. + */ +static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, + struct iovec **iovp, struct iov_iter *i, + bool compat) +{ + struct iovec *iov = *iovp; + ssize_t ret; + + if (compat) + ret = copy_compat_iovec_from_user(iov, uvec, 1); + else + ret = copy_iovec_from_user(iov, uvec, 1); + if (unlikely(ret)) + return ret; + + ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); + if (unlikely(ret)) + return ret; + *iovp = NULL; + return i->count; +} + ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) @@ -1788,6 +1833,9 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned long seg; struct iovec *iov; + if (nr_segs == 1) + return __import_iovec_ubuf(type, uvec, iovp, i, compat); + iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; @@ -1866,9 +1914,7 @@ int import_single_range(int rw, void __user *buf, size_t len, if (unlikely(!access_ok(buf, len))) return -EFAULT; - iov->iov_base = buf; - iov->iov_len = len; - iov_iter_init(i, rw, iov, 1, len); + iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL(import_single_range); @@ -1918,7 +1964,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else - i->iov -= state->nr_segs - i->nr_segs; + i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index de0ee2e03ed6..b08bb1fba106 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -55,14 +55,24 @@ static int debugfs_print_results(struct seq_file *seq, void *v) enum kunit_status success = kunit_suite_has_succeeded(suite); struct kunit_case *test_case; - if (!suite || !suite->log) + if (!suite) return 0; - seq_printf(seq, "%s", suite->log); + /* Print KTAP header so the debugfs log can be parsed as valid KTAP. */ + seq_puts(seq, "KTAP version 1\n"); + seq_puts(seq, "1..1\n"); + + /* Print suite header because it is not stored in the test logs. */ + seq_puts(seq, KUNIT_SUBTEST_INDENT "KTAP version 1\n"); + seq_printf(seq, KUNIT_SUBTEST_INDENT "# Subtest: %s\n", suite->name); + seq_printf(seq, KUNIT_SUBTEST_INDENT "1..%zd\n", kunit_suite_num_test_cases(suite)); kunit_suite_for_each_test_case(suite, test_case) debugfs_print_result(seq, suite, test_case); + if (suite->log) + seq_printf(seq, "%s", suite->log); + seq_printf(seq, "%s %d %s\n", kunit_status_to_ok_not_ok(success), 1, suite->name); return 0; diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index 4df0335d0d06..42e44caa1bdd 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -6,6 +6,7 @@ * Author: Brendan Higgins <brendanhiggins@google.com> */ #include <kunit/test.h> +#include <kunit/test-bug.h> #include "try-catch-impl.h" @@ -443,18 +444,6 @@ static struct kunit_suite kunit_resource_test_suite = { .test_cases = kunit_resource_test_cases, }; -static void kunit_log_test(struct kunit *test); - -static struct kunit_case kunit_log_test_cases[] = { - KUNIT_CASE(kunit_log_test), - {} -}; - -static struct kunit_suite kunit_log_test_suite = { - .name = "kunit-log-test", - .test_cases = kunit_log_test_cases, -}; - static void kunit_log_test(struct kunit *test) { struct kunit_suite suite; @@ -481,6 +470,29 @@ static void kunit_log_test(struct kunit *test) #endif } +static void kunit_log_newline_test(struct kunit *test) +{ + kunit_info(test, "Add newline\n"); + if (test->log) { + KUNIT_ASSERT_NOT_NULL_MSG(test, strstr(test->log, "Add newline\n"), + "Missing log line, full log:\n%s", test->log); + KUNIT_EXPECT_NULL(test, strstr(test->log, "Add newline\n\n")); + } else { + kunit_skip(test, "only useful when debugfs is enabled"); + } +} + +static struct kunit_case kunit_log_test_cases[] = { + KUNIT_CASE(kunit_log_test), + KUNIT_CASE(kunit_log_newline_test), + {} +}; + +static struct kunit_suite kunit_log_test_suite = { + .name = "kunit-log-test", + .test_cases = kunit_log_test_cases, +}; + static void kunit_status_set_failure_test(struct kunit *test) { struct kunit fake; @@ -521,7 +533,46 @@ static struct kunit_suite kunit_status_test_suite = { .test_cases = kunit_status_test_cases, }; +static void kunit_current_test(struct kunit *test) +{ + /* Check results of both current->kunit_test and + * kunit_get_current_test() are equivalent to current test. + */ + KUNIT_EXPECT_PTR_EQ(test, test, current->kunit_test); + KUNIT_EXPECT_PTR_EQ(test, test, kunit_get_current_test()); +} + +static void kunit_current_fail_test(struct kunit *test) +{ + struct kunit fake; + + kunit_init_test(&fake, "fake test", NULL); + KUNIT_EXPECT_EQ(test, fake.status, KUNIT_SUCCESS); + + /* Set current->kunit_test to fake test. */ + current->kunit_test = &fake; + + kunit_fail_current_test("This should make `fake` test fail."); + KUNIT_EXPECT_EQ(test, fake.status, (enum kunit_status)KUNIT_FAILURE); + kunit_cleanup(&fake); + + /* Reset current->kunit_test to current test. */ + current->kunit_test = test; +} + +static struct kunit_case kunit_current_test_cases[] = { + KUNIT_CASE(kunit_current_test), + KUNIT_CASE(kunit_current_fail_test), + {} +}; + +static struct kunit_suite kunit_current_test_suite = { + .name = "kunit_current", + .test_cases = kunit_current_test_cases, +}; + kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite, - &kunit_log_test_suite, &kunit_status_test_suite); + &kunit_log_test_suite, &kunit_status_test_suite, + &kunit_current_test_suite); MODULE_LICENSE("GPL v2"); diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c9e15bb60058..e2910b261112 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -108,28 +108,51 @@ static void kunit_print_test_stats(struct kunit *test, stats.total); } +/** + * kunit_log_newline() - Add newline to the end of log if one is not + * already present. + * @log: The log to add the newline to. + */ +static void kunit_log_newline(char *log) +{ + int log_len, len_left; + + log_len = strlen(log); + len_left = KUNIT_LOG_SIZE - log_len - 1; + + if (log_len > 0 && log[log_len - 1] != '\n') + strncat(log, "\n", len_left); +} + /* * Append formatted message to log, size of which is limited to * KUNIT_LOG_SIZE bytes (including null terminating byte). */ void kunit_log_append(char *log, const char *fmt, ...) { - char line[KUNIT_LOG_SIZE]; va_list args; - int len_left; + int len, log_len, len_left; if (!log) return; - len_left = KUNIT_LOG_SIZE - strlen(log) - 1; + log_len = strlen(log); + len_left = KUNIT_LOG_SIZE - log_len - 1; if (len_left <= 0) return; + /* Evaluate length of line to add to log */ + va_start(args, fmt); + len = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + /* Print formatted line to the log */ va_start(args, fmt); - vsnprintf(line, sizeof(line), fmt, args); + vsnprintf(log + log_len, min(len, len_left), fmt, args); va_end(args); - strncat(log, line, len_left); + /* Add newline to end of log if not already present. */ + kunit_log_newline(log); } EXPORT_SYMBOL_GPL(kunit_log_append); @@ -147,10 +170,18 @@ EXPORT_SYMBOL_GPL(kunit_suite_num_test_cases); static void kunit_print_suite_start(struct kunit_suite *suite) { - kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "KTAP version 1\n"); - kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "# Subtest: %s", + /* + * We do not log the test suite header as doing so would + * mean debugfs display would consist of the test suite + * header prior to individual test results. + * Hence directly printk the suite status, and we will + * separately seq_printf() the suite header for the debugfs + * representation. + */ + pr_info(KUNIT_SUBTEST_INDENT "KTAP version 1\n"); + pr_info(KUNIT_SUBTEST_INDENT "# Subtest: %s\n", suite->name); - kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "1..%zd", + pr_info(KUNIT_SUBTEST_INDENT "1..%zd\n", kunit_suite_num_test_cases(suite)); } @@ -167,10 +198,9 @@ static void kunit_print_ok_not_ok(void *test_or_suite, /* * We do not log the test suite results as doing so would - * mean debugfs display would consist of the test suite - * description and status prior to individual test results. - * Hence directly printk the suite status, and we will - * separately seq_printf() the suite status for the debugfs + * mean debugfs display would consist of an incorrect test + * number. Hence directly printk the suite result, and we will + * separately seq_printf() the suite results for the debugfs * representation. */ if (suite) @@ -437,7 +467,6 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite, struct kunit_try_catch_context context; struct kunit_try_catch *try_catch; - kunit_init_test(test, test_case->name, test_case->log); try_catch = &test->try_catch; kunit_try_catch_init(try_catch, @@ -533,6 +562,8 @@ int kunit_run_tests(struct kunit_suite *suite) struct kunit_result_stats param_stats = { 0 }; test_case->status = KUNIT_SKIPPED; + kunit_init_test(&test, test_case->name, test_case->log); + if (!test_case->generate_params) { /* Non-parameterised test. */ kunit_run_case_catch_errors(suite, test_case, &test); diff --git a/lib/list-test.c b/lib/list-test.c index d374cf5d1a57..0cc27de9cec8 100644 --- a/lib/list-test.c +++ b/lib/list-test.c @@ -8,6 +8,7 @@ #include <kunit/test.h> #include <linux/list.h> +#include <linux/klist.h> struct list_test_struct { int data; @@ -1199,6 +1200,303 @@ static struct kunit_suite hlist_test_module = { .test_cases = hlist_test_cases, }; -kunit_test_suites(&list_test_module, &hlist_test_module); + +struct klist_test_struct { + int data; + struct klist klist; + struct klist_node klist_node; +}; + +static int node_count; +static struct klist_node *last_node; + +static void check_node(struct klist_node *node_ptr) +{ + node_count++; + last_node = node_ptr; +} + +static void check_delete_node(struct klist_node *node_ptr) +{ + node_count--; + last_node = node_ptr; +} + +static void klist_test_add_tail(struct kunit *test) +{ + struct klist_node a, b; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_tail(&a, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 1); + KUNIT_EXPECT_PTR_EQ(test, last_node, &a); + + klist_add_tail(&b, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 2); + KUNIT_EXPECT_PTR_EQ(test, last_node, &b); + + /* should be [list] -> a -> b */ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_add_head(struct kunit *test) +{ + struct klist_node a, b; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_head(&a, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 1); + KUNIT_EXPECT_PTR_EQ(test, last_node, &a); + + klist_add_head(&b, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 2); + KUNIT_EXPECT_PTR_EQ(test, last_node, &b); + + /* should be [list] -> b -> a */ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_add_behind(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_head(&a, &mylist); + klist_add_head(&b, &mylist); + + klist_add_behind(&c, &a); + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + + klist_add_behind(&d, &b); + KUNIT_EXPECT_EQ(test, node_count, 4); + KUNIT_EXPECT_PTR_EQ(test, last_node, &d); + + klist_iter_init(&mylist, &i); + + /* should be [list] -> b -> d -> a -> c*/ + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &c); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_add_before(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_head(&a, &mylist); + klist_add_head(&b, &mylist); + klist_add_before(&c, &a); + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + + klist_add_before(&d, &b); + KUNIT_EXPECT_EQ(test, node_count, 4); + KUNIT_EXPECT_PTR_EQ(test, last_node, &d); + + klist_iter_init(&mylist, &i); + + /* should be [list] -> b -> d -> a -> c*/ + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &c); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +/* + * Verify that klist_del() delays the deletion of a node until there + * are no other references to it + */ +static void klist_test_del_refcount_greater_than_zero(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, &check_delete_node); + + /* Add nodes a,b,c,d to the list*/ + klist_add_tail(&a, &mylist); + klist_add_tail(&b, &mylist); + klist_add_tail(&c, &mylist); + klist_add_tail(&d, &mylist); + + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + /* Advance the iterator to point to node c*/ + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &c); + + /* Try to delete node c while there is a reference to it*/ + klist_del(&c); + + /* + * Verify that node c is still attached to the list even after being + * deleted. Since the iterator still points to c, the reference count is not + * decreased to 0 + */ + KUNIT_EXPECT_TRUE(test, klist_node_attached(&c)); + + /* Check that node c has not been removed yet*/ + KUNIT_EXPECT_EQ(test, node_count, 4); + KUNIT_EXPECT_PTR_EQ(test, last_node, &d); + + klist_iter_exit(&i); + + /* + * Since the iterator is no longer pointing to node c, node c is removed + * from the list + */ + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + +} + +/* + * Verify that klist_del() deletes a node immediately when there are no + * other references to it. + */ +static void klist_test_del_refcount_zero(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, &check_delete_node); + + /* Add nodes a,b,c,d to the list*/ + klist_add_tail(&a, &mylist); + klist_add_tail(&b, &mylist); + klist_add_tail(&c, &mylist); + klist_add_tail(&d, &mylist); + /* Delete node c*/ + klist_del(&c); + + /* Check that node c is deleted from the list*/ + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + + /* Should be [list] -> a -> b -> d*/ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_remove(struct kunit *test) +{ + /* This test doesn't check correctness under concurrent access */ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, &check_delete_node); + + /* Add nodes a,b,c,d to the list*/ + klist_add_tail(&a, &mylist); + klist_add_tail(&b, &mylist); + klist_add_tail(&c, &mylist); + klist_add_tail(&d, &mylist); + /* Delete node c*/ + klist_remove(&c); + + /* Check the nodes in the list*/ + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + + /* should be [list] -> a -> b -> d*/ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_node_attached(struct kunit *test) +{ + struct klist_node a = {}; + struct klist mylist; + + klist_init(&mylist, NULL, NULL); + + KUNIT_EXPECT_FALSE(test, klist_node_attached(&a)); + klist_add_head(&a, &mylist); + KUNIT_EXPECT_TRUE(test, klist_node_attached(&a)); + klist_del(&a); + KUNIT_EXPECT_FALSE(test, klist_node_attached(&a)); + +} + +static struct kunit_case klist_test_cases[] = { + KUNIT_CASE(klist_test_add_tail), + KUNIT_CASE(klist_test_add_head), + KUNIT_CASE(klist_test_add_behind), + KUNIT_CASE(klist_test_add_before), + KUNIT_CASE(klist_test_del_refcount_greater_than_zero), + KUNIT_CASE(klist_test_del_refcount_zero), + KUNIT_CASE(klist_test_remove), + KUNIT_CASE(klist_test_node_attached), + {}, +}; + +static struct kunit_suite klist_test_module = { + .name = "klist", + .test_cases = klist_test_cases, +}; + +kunit_test_suites(&list_test_module, &hlist_test_module, &klist_test_module); MODULE_LICENSE("GPL v2"); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db60edb55f2f..1281a40d5735 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1303,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; - if (node->node_count) { - unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; - } else { - slots = (void **)&node->slot; - } - + max_req = MAPLE_ALLOC_SLOTS - node->node_count; + slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; + if (node->node_count == 0) { + node->slot[0]->node_count = 0; + node->slot[0]->request_count = 0; + } + node->node_count += count; allocated += count; node = node->slot[0]; - node->node_count = 0; - node->request_count = 0; requested -= count; } mas->alloc->total = allocated; @@ -4970,7 +4965,8 @@ not_found: * Return: True if found in a leaf, false otherwise. * */ -static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) +static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, + unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); @@ -5035,8 +5031,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) if (unlikely(ma_is_leaf(type))) { mas->offset = offset; - mas->min = min; - mas->max = min + gap - 1; + *gap_min = min; + *gap_max = min + gap - 1; return true; } @@ -5060,10 +5056,10 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; - unsigned char offset; - unsigned long *gaps; - unsigned long *pivots = ma_pivots(mas_mn(mas), type); - void __rcu **slots = ma_slots(mas_mn(mas), type); + unsigned char offset, data_end; + unsigned long *gaps, *pivots; + void __rcu **slots; + struct maple_node *node; bool found = false; if (ma_is_dense(type)) { @@ -5071,13 +5067,15 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) return true; } - gaps = ma_gaps(mte_to_node(mas->node), type); + node = mas_mn(mas); + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); - for (; offset < mt_slots[type]; offset++) { - pivot = mas_safe_pivot(mas, pivots, offset, type); - if (offset && !pivot) - break; + data_end = ma_data_end(node, type, pivots, mas->max); + for (; offset <= data_end; offset++) { + pivot = mas_logical_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) @@ -5312,6 +5310,9 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long *pivots; enum maple_type mt; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) @@ -5366,6 +5367,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, { struct maple_enode *last = mas->node; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) { mas_start(mas); mas->offset = mas_data_end(mas); @@ -5385,7 +5389,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, mas->index = min; mas->last = max; - while (!mas_rev_awalk(mas, size)) { + while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; @@ -5400,17 +5404,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; - /* - * mas_rev_awalk() has set mas->min and mas->max to the gap values. If - * the maximum is outside the window we are searching, then use the last - * location in the search. - * mas->max and mas->min is the range of the gap. - * mas->index and mas->last are currently set to the search range. - */ - /* Trim the upper limit to the max. */ - if (mas->max <= mas->last) - mas->last = mas->max; + if (max <= mas->last) + mas->last = max; mas->index = mas->last - size + 1; return 0; diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index de4ee0d50906..cd2bdba6d3ed 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -334,7 +334,7 @@ kvfree_rcu_1_arg_vmalloc_test(void) return -1; p->array[0] = 'a'; - kvfree_rcu(p); + kvfree_rcu_mightsleep(p); } return 0; diff --git a/mm/Kconfig b/mm/Kconfig index 4751031f3f05..ebfe5796adf8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -686,7 +686,6 @@ config BOUNCE config MMU_NOTIFIER bool - select SRCU select INTERVAL_TREE config KSM diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a53b9360b72e..30d2d0386fdb 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -507,6 +507,15 @@ static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); +static void cgwb_free_rcu(struct rcu_head *rcu_head) +{ + struct bdi_writeback *wb = container_of(rcu_head, + struct bdi_writeback, rcu); + + percpu_ref_exit(&wb->refcnt); + kfree(wb); +} + static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -529,11 +538,10 @@ static void cgwb_release_workfn(struct work_struct *work) list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); - percpu_ref_exit(&wb->refcnt); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); - kfree_rcu(wb, rcu); + call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 032fb0ef9cd1..3fae2d2496ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1838,10 +1838,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); struct page *page = pfn_swap_entry_to_page(entry); + pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { - pmd_t newpmd; /* * A protection check is difficult so * just be safe and disable write @@ -1855,8 +1855,16 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = pmd_swp_mksoft_dirty(newpmd); if (pmd_swp_uffd_wp(*pmd)) newpmd = pmd_swp_mkuffd_wp(newpmd); - set_pmd_at(mm, addr, pmd, newpmd); + } else { + newpmd = *pmd; } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); goto unlock; } #endif @@ -2657,9 +2665,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); is_hzp = is_huge_zero_page(&folio->page); - VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); - if (is_hzp) + if (is_hzp) { + pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; + } if (folio_test_writeback(folio)) return -EBUSY; @@ -3251,6 +3260,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); + if (pmd_uffd_wp(pmdval)) + pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 92e6f56a932d..0ec69b96b497 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -572,6 +572,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_NON_PRESENT; goto out; } + if (pte_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out; + } page = vm_normal_page(vma, address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3807502766a3..ec0da72e65aa 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -148,35 +148,74 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) * into the virtual memory. If those physical pages already had shadow/origin, * those are ignored. */ -void kmsan_ioremap_page_range(unsigned long start, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift) +int kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) { gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; struct page *shadow, *origin; unsigned long off = 0; - int nr; + int nr, err = 0, clean = 0, mapped; if (!kmsan_enabled || kmsan_in_runtime()) - return; + return 0; nr = (end - start) / PAGE_SIZE; kmsan_enter_runtime(); - for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) { shadow = alloc_pages(gfp_mask, 1); origin = alloc_pages(gfp_mask, 1); - __vmap_pages_range_noflush( + if (!shadow || !origin) { + err = -ENOMEM; + goto ret; + } + mapped = __vmap_pages_range_noflush( vmalloc_shadow(start + off), vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, PAGE_SHIFT); - __vmap_pages_range_noflush( + if (mapped) { + err = mapped; + goto ret; + } + shadow = NULL; + mapped = __vmap_pages_range_noflush( vmalloc_origin(start + off), vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, PAGE_SHIFT); + if (mapped) { + __vunmap_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE)); + err = mapped; + goto ret; + } + origin = NULL; + } + /* Page mapping loop finished normally, nothing to clean up. */ + clean = 0; + +ret: + if (clean > 0) { + /* + * Something went wrong. Clean up shadow/origin pages allocated + * on the last loop iteration, then delete mappings created + * during the previous iterations. + */ + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + __vunmap_range_noflush( + vmalloc_shadow(start), + vmalloc_shadow(start + clean * PAGE_SIZE)); + __vunmap_range_noflush( + vmalloc_origin(start), + vmalloc_origin(start + clean * PAGE_SIZE)); } flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); kmsan_leave_runtime(); + return err; } void kmsan_iounmap_page_range(unsigned long start, unsigned long end) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index a787c04e9583..b8bb95eea5e3 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -216,27 +216,29 @@ void kmsan_free_page(struct page *page, unsigned int order) kmsan_leave_runtime(); } -void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift) +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; - int nr, mapped; + int nr, mapped, err = 0; if (!kmsan_enabled) - return; + return 0; shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); if (!shadow_start) - return; + return 0; nr = (end - start) / PAGE_SIZE; s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); - if (!s_pages || !o_pages) + if (!s_pages || !o_pages) { + err = -ENOMEM; goto ret; + } for (int i = 0; i < nr; i++) { s_pages[i] = shadow_page_for(pages[i]); o_pages[i] = origin_page_for(pages[i]); @@ -249,10 +251,16 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, s_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = mapped; + goto ret; + } mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, o_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = mapped; + goto ret; + } kmsan_leave_runtime(); flush_tlb_kernel_range(shadow_start, shadow_end); flush_tlb_kernel_range(origin_start, origin_end); @@ -262,6 +270,7 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, ret: kfree(s_pages); kfree(o_pages); + return err; } /* Allocate metadata for pages allocated at boot time. */ diff --git a/mm/madvise.c b/mm/madvise.c index 340125d08c03..9f389c5304d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1456,7 +1456,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; - struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; @@ -1503,12 +1503,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, total_len = iov_iter_count(&iter); while (iov_iter_count(&iter)) { - iovec = iov_iter_iovec(&iter); - ret = do_madvise(mm, (unsigned long)iovec.iov_base, - iovec.iov_len, behavior); + ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), + iter_iov_len(&iter), behavior); if (ret < 0) break; - iov_iter_advance(&iter, iovec.iov_len); + iov_iter_advance(&iter, iter_iov_len(&iter)); } ret = (total_len - iov_iter_count(&iter)) ? : ret; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a256a241fd1d..2068b594dc88 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -790,61 +790,50 @@ static int vma_replace_policy(struct vm_area_struct *vma, return err; } -/* Step 2: apply policy to a range and do splits. */ -static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *prev; - struct vm_area_struct *vma; - int err = 0; + struct vm_area_struct *merged; + unsigned long vmstart, vmend; pgoff_t pgoff; + int err; - prev = vma_prev(&vmi); - vma = vma_find(&vmi, end); - if (WARN_ON(!vma)) + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } + + if (mpol_equal(vma_policy(vma), new_pol)) return 0; - if (start > vma->vm_start) - prev = vma; - - do { - unsigned long vmstart = max(start, vma->vm_start); - unsigned long vmend = min(end, vma->vm_end); - - if (mpol_equal(vma_policy(vma), new_pol)) - goto next; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); - if (prev) { - vma = prev; - goto replace; - } - if (vma->vm_start != vmstart) { - err = split_vma(&vmi, vma, vmstart, 1); - if (err) - goto out; - } - if (vma->vm_end != vmend) { - err = split_vma(&vmi, vma, vmend, 0); - if (err) - goto out; - } -replace: - err = vma_replace_policy(vma, new_pol); + pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol, + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (merged) { + *prev = merged; + return vma_replace_policy(merged, new_pol); + } + + if (vma->vm_start != vmstart) { + err = split_vma(vmi, vma, vmstart, 1); if (err) - goto out; -next: - prev = vma; - } for_each_vma_range(vmi, vma, end); + return err; + } -out: - return err; + if (vma->vm_end != vmend) { + err = split_vma(vmi, vma, vmend, 0); + if (err) + return err; + } + + *prev = vma; + return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ @@ -1259,6 +1248,8 @@ static long do_mbind(unsigned long start, unsigned long len, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vma_iterator vmi; struct mempolicy *new; unsigned long end; int err; @@ -1328,7 +1319,13 @@ static long do_mbind(unsigned long start, unsigned long len, goto up_out; } - err = mbind_range(mm, start, end, new); + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } if (!err) { int nr_failed = 0; @@ -1489,10 +1486,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; - unsigned long vmstart; - unsigned long vmend; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); @@ -1521,6 +1516,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); + prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND @@ -1541,9 +1537,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le } new->home_node = home_node; - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - err = mbind_range(mm, vmstart, vmend, new); + err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; diff --git a/mm/mmap.c b/mm/mmap.c index ff68a67a2a7c..d5475fbf5729 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1518,7 +1518,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, low_limit; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); @@ -1527,12 +1528,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1, - length)) + low_limit = info->low_limit; +retry: + if (mas_empty_area(&mas, low_limit, info->high_limit - 1, length)) return -ENOMEM; gap = mas.index; gap += (info->align_offset - gap) & info->align_mask; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + mas_reset(&mas); + goto retry; + } + } + return gap; } @@ -1548,7 +1566,8 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, high_limit, gap_end; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ @@ -1556,12 +1575,31 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + high_limit = info->high_limit; +retry: + if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1, length)) return -ENOMEM; gap = mas.last + 1 - info->length; gap -= (gap - info->align_offset) & info->align_mask; + gap_end = mas.last; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) <= gap_end) { + high_limit = vm_start_gap(tmp); + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + mas_reset(&mas); + goto retry; + } + } + return gap; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 13e84d8c0797..36351a00c0e8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (vma_iter_end(&vmi) < end) + if (!error && vma_iter_end(&vmi) < end) error = -ENOMEM; out: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 516b1aa247e8..db7943999007 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2583,46 +2583,6 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) return ret; } -/** - * folio_write_one - write out a single folio and wait on I/O. - * @folio: The folio to write. - * - * The folio must be locked by the caller and will be unlocked upon return. - * - * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this - * function returns. - * - * Return: %0 on success, negative error code otherwise - */ -int folio_write_one(struct folio *folio) -{ - struct address_space *mapping = folio->mapping; - int ret = 0; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = folio_nr_pages(folio), - }; - - BUG_ON(!folio_test_locked(folio)); - - folio_wait_writeback(folio); - - if (folio_clear_dirty_for_io(folio)) { - folio_get(folio); - ret = mapping->a_ops->writepage(&folio->page, &wbc); - if (ret == 0) - folio_wait_writeback(folio); - folio_put(folio); - } else { - folio_unlock(folio); - } - - if (!ret) - ret = filemap_check_errors(mapping); - return ret; -} -EXPORT_SYMBOL(folio_write_one); - /* * For address_spaces which do not use buffers nor write back. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7136c36c5d01..8e39705c7bdc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6632,7 +6632,21 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; + unsigned long flags; + /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA @@ -6671,6 +6685,8 @@ static void __build_all_zonelists(void *data) } write_sequnlock(&zonelist_update_seq); + printk_deferred_exit(); + local_irq_restore(flags); } static noinline void __init @@ -9450,6 +9466,9 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; + + if (PageHuge(page)) + return false; } return true; } diff --git a/mm/shmem.c b/mm/shmem.c index 448f393d8ab2..b76521ed372d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3339,10 +3339,6 @@ static const struct xattr_handler shmem_trusted_xattr_handler = { }; static const struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, NULL diff --git a/mm/swap.c b/mm/swap.c index 57cb01b042f6..423199ee8478 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -222,7 +222,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_init(fbatch); + folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a50072066221..31ff782d368b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -313,8 +313,8 @@ int ioremap_page_range(unsigned long addr, unsigned long end, ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) - kmsan_ioremap_page_range(addr, end, phys_addr, prot, - ioremap_max_page_shift); + err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -605,7 +605,11 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, + page_shift); + + if (ret) + return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c1c5e8b24b8..7ba6bfdd9a5f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1151,12 +1151,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) DEFINE_WAIT(wait); /* - * Do not throttle IO workers, kthreads other than kswapd or + * Do not throttle user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && - current->flags & (PF_IO_WORKER|PF_KTHREAD)) { + current->flags & (PF_USER_WORKER|PF_KTHREAD)) { cond_resched(); return; } diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index c64050e839ac..1fffe2bed5b0 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -280,6 +280,10 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) write_unlock(&xen_9pfs_lock); for (i = 0; i < priv->num_rings; i++) { + struct xen_9pfs_dataring *ring = &priv->rings[i]; + + cancel_work_sync(&ring->work); + if (!priv->rings[i].intf) break; if (priv->rings[i].irq > 0) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 17b946f9ba31..8455ba141ee6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -68,7 +68,7 @@ static const struct sco_param esco_param_msbc[] = { }; /* This function requires the caller holds hdev->lock */ -static void hci_connect_le_scan_cleanup(struct hci_conn *conn) +static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) { struct hci_conn_params *params; struct hci_dev *hdev = conn->hdev; @@ -88,9 +88,28 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn) params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr, bdaddr_type); - if (!params || !params->explicit_connect) + if (!params) return; + if (params->conn) { + hci_conn_drop(params->conn); + hci_conn_put(params->conn); + params->conn = NULL; + } + + if (!params->explicit_connect) + return; + + /* If the status indicates successful cancellation of + * the attempt (i.e. Unknown Connection Id) there's no point of + * notifying failure since we'll go back to keep trying to + * connect. The only exception is explicit connect requests + * where a timeout + cancel does indicate an actual failure. + */ + if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); + /* The connection attempt was doing scan for new RPA, and is * in scan phase. If params are not associated with any other * autoconnect action, remove them completely. If they are, just unmark @@ -178,7 +197,7 @@ static void le_scan_cleanup(struct work_struct *work) rcu_read_unlock(); if (c == conn) { - hci_connect_le_scan_cleanup(conn); + hci_connect_le_scan_cleanup(conn, 0x00); hci_conn_cleanup(conn); } @@ -1049,6 +1068,17 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, return conn; } +static bool hci_conn_unlink(struct hci_conn *conn) +{ + if (!conn->link) + return false; + + conn->link->link = NULL; + conn->link = NULL; + + return true; +} + int hci_conn_del(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -1060,15 +1090,16 @@ int hci_conn_del(struct hci_conn *conn) cancel_delayed_work_sync(&conn->idle_work); if (conn->type == ACL_LINK) { - struct hci_conn *sco = conn->link; - if (sco) { - sco->link = NULL; + struct hci_conn *link = conn->link; + + if (link) { + hci_conn_unlink(conn); /* Due to race, SCO connection might be not established * yet at this point. Delete it now, otherwise it is * possible for it to be stuck and can't be deleted. */ - if (sco->handle == HCI_CONN_HANDLE_UNSET) - hci_conn_del(sco); + if (link->handle == HCI_CONN_HANDLE_UNSET) + hci_conn_del(link); } /* Unacked frames */ @@ -1084,7 +1115,7 @@ int hci_conn_del(struct hci_conn *conn) struct hci_conn *acl = conn->link; if (acl) { - acl->link = NULL; + hci_conn_unlink(conn); hci_conn_drop(acl); } @@ -1179,31 +1210,8 @@ EXPORT_SYMBOL(hci_get_route); static void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; - struct hci_conn_params *params; - params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, - conn->dst_type); - if (params && params->conn) { - hci_conn_drop(params->conn); - hci_conn_put(params->conn); - params->conn = NULL; - } - - /* If the status indicates successful cancellation of - * the attempt (i.e. Unknown Connection Id) there's no point of - * notifying failure since we'll go back to keep trying to - * connect. The only exception is explicit connect requests - * where a timeout + cancel does indicate an actual failure. - */ - if (status != HCI_ERROR_UNKNOWN_CONN_ID || - (params && params->explicit_connect)) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, status); - - /* Since we may have temporarily stopped the background scanning in - * favor of connection establishment, we should restart it. - */ - hci_update_passive_scan(hdev); + hci_connect_le_scan_cleanup(conn, status); /* Enable advertising in case this was a failed connection * attempt as a peripheral. @@ -1237,15 +1245,15 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; + bt_dev_dbg(hdev, "err %d", err); + hci_dev_lock(hdev); if (!err) { - hci_connect_le_scan_cleanup(conn); + hci_connect_le_scan_cleanup(conn, 0x00); goto done; } - bt_dev_err(hdev, "request failed to create LE connection: err %d", err); - /* Check if connection is still pending */ if (conn != hci_lookup_le_connect(hdev)) goto done; @@ -2438,6 +2446,12 @@ void hci_conn_hash_flush(struct hci_dev *hdev) c->state = BT_CLOSED; hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM); + + /* Unlink before deleting otherwise it is possible that + * hci_conn_del removes the link which may cause the list to + * contain items already freed. + */ + hci_conn_unlink(c); hci_conn_del(c); } } @@ -2775,6 +2789,9 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) { int r = 0; + if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + return 0; + switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ad92a4be5851..e87c928c9e17 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2881,16 +2881,6 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, conn->resp_addr_type = peer_addr_type; bacpy(&conn->resp_addr, peer_addr); - - /* We don't want the connection attempt to stick around - * indefinitely since LE doesn't have a page timeout concept - * like BR/EDR. Set a timer for any connection that doesn't use - * the accept list for connecting. - */ - if (filter_policy == HCI_LE_USE_PEER_ADDR) - queue_delayed_work(conn->hdev->workqueue, - &conn->le_conn_timeout, - conn->conn_timeout); } static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) @@ -5902,6 +5892,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, if (status) goto unlock; + /* Drop the connection if it has been aborted */ + if (test_bit(HCI_CONN_CANCEL, &conn->flags)) { + hci_conn_drop(conn); + goto unlock; + } + if (conn->dst_type == ADDR_LE_DEV_PUBLIC) addr_type = BDADDR_LE_PUBLIC; else @@ -6995,7 +6991,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis->iso_qos.in.latency = le16_to_cpu(ev->interval) * 125 / 100; bis->iso_qos.in.sdu = le16_to_cpu(ev->max_pdu); - hci_connect_cfm(bis, ev->status); + hci_iso_setup_path(bis); } hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5a6aa1627791..632be1267288 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -246,8 +246,9 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk); if (IS_ERR(skb)) { - bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode, - PTR_ERR(skb)); + if (!event) + bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode, + PTR_ERR(skb)); return PTR_ERR(skb); } @@ -5126,8 +5127,11 @@ static int hci_le_connect_cancel_sync(struct hci_dev *hdev, if (test_bit(HCI_CONN_SCANNING, &conn->flags)) return 0; + if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + return 0; + return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL, - 6, &conn->dst, HCI_CMD_TIMEOUT); + 0, NULL, HCI_CMD_TIMEOUT); } static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -6102,6 +6106,9 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) conn->conn_timeout, NULL); done: + if (err == -ETIMEDOUT) + hci_le_connect_cancel_sync(hdev, conn); + /* Re-enable advertising after the connection attempt is finished. */ hci_resume_advertising_sync(hdev); return err; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index bed1a7b9205c..707f229f896a 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -433,7 +433,7 @@ static void hidp_set_timer(struct hidp_session *session) static void hidp_del_timer(struct hidp_session *session) { if (session->idle_to > 0) - del_timer(&session->timer); + del_timer_sync(&session->timer); } static void hidp_process_report(struct hidp_session *session, int type, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 49926f59cc12..55a7226233f9 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4652,33 +4652,27 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid); - mutex_lock(&conn->chan_lock); - - chan = __l2cap_get_chan_by_scid(conn, dcid); + chan = l2cap_get_chan_by_scid(conn, dcid); if (!chan) { - mutex_unlock(&conn->chan_lock); cmd_reject_invalid_cid(conn, cmd->ident, dcid, scid); return 0; } - l2cap_chan_hold(chan); - l2cap_chan_lock(chan); - rsp.dcid = cpu_to_le16(chan->scid); rsp.scid = cpu_to_le16(chan->dcid); l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp); chan->ops->set_shutdown(chan); + mutex_lock(&conn->chan_lock); l2cap_chan_del(chan, ECONNRESET); + mutex_unlock(&conn->chan_lock); chan->ops->close(chan); l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); - return 0; } @@ -4698,33 +4692,27 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid); - mutex_lock(&conn->chan_lock); - - chan = __l2cap_get_chan_by_scid(conn, scid); + chan = l2cap_get_chan_by_scid(conn, scid); if (!chan) { mutex_unlock(&conn->chan_lock); return 0; } - l2cap_chan_hold(chan); - l2cap_chan_lock(chan); - if (chan->state != BT_DISCONN) { l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); return 0; } + mutex_lock(&conn->chan_lock); l2cap_chan_del(chan, 0); + mutex_unlock(&conn->chan_lock); chan->ops->close(chan); l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); - return 0; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 1111da4e2f2b..cd1a27ac555d 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -235,27 +235,41 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk, return err; } -static int sco_connect(struct hci_dev *hdev, struct sock *sk) +static int sco_connect(struct sock *sk) { struct sco_conn *conn; struct hci_conn *hcon; + struct hci_dev *hdev; int err, type; BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); + if (!hdev) + return -EHOSTUNREACH; + + hci_dev_lock(hdev); + if (lmp_esco_capable(hdev) && !disable_esco) type = ESCO_LINK; else type = SCO_LINK; if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT && - (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) - return -EOPNOTSUPP; + (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) { + err = -EOPNOTSUPP; + goto unlock; + } hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting, &sco_pi(sk)->codec); - if (IS_ERR(hcon)) - return PTR_ERR(hcon); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto unlock; + } + + hci_dev_unlock(hdev); + hci_dev_put(hdev); conn = sco_conn_add(hcon); if (!conn) { @@ -263,13 +277,15 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk) return -ENOMEM; } - /* Update source addr of the socket */ - bacpy(&sco_pi(sk)->src, &hcon->src); - err = sco_chan_add(conn, sk, NULL); if (err) return err; + lock_sock(sk); + + /* Update source addr of the socket */ + bacpy(&sco_pi(sk)->src, &hcon->src); + if (hcon->state == BT_CONNECTED) { sco_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; @@ -278,6 +294,13 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk) sco_sock_set_timer(sk, sk->sk_sndtimeo); } + release_sock(sk); + + return err; + +unlock: + hci_dev_unlock(hdev); + hci_dev_put(hdev); return err; } @@ -565,7 +588,6 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; - struct hci_dev *hdev; int err; BT_DBG("sk %p", sk); @@ -574,37 +596,26 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen addr->sa_family != AF_BLUETOOTH) return -EINVAL; - lock_sock(sk); - if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { - err = -EBADFD; - goto done; - } + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) + return -EBADFD; - if (sk->sk_type != SOCK_SEQPACKET) { + if (sk->sk_type != SOCK_SEQPACKET) err = -EINVAL; - goto done; - } - - hdev = hci_get_route(&sa->sco_bdaddr, &sco_pi(sk)->src, BDADDR_BREDR); - if (!hdev) { - err = -EHOSTUNREACH; - goto done; - } - hci_dev_lock(hdev); + lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); + release_sock(sk); - err = sco_connect(hdev, sk); - hci_dev_unlock(hdev); - hci_dev_put(hdev); + err = sco_connect(sk); if (err) - goto done; + return err; + + lock_sock(sk); err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); -done: release_sock(sk); return err; } @@ -1129,6 +1140,8 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, break; } + release_sock(sk); + /* find total buffer size required to copy codec + caps */ hci_dev_lock(hdev); list_for_each_entry(c, &hdev->local_codecs, list) { @@ -1146,15 +1159,13 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, buf_len += sizeof(struct bt_codecs); if (buf_len > len) { hci_dev_put(hdev); - err = -ENOBUFS; - break; + return -ENOBUFS; } ptr = optval; if (put_user(num_codecs, ptr)) { hci_dev_put(hdev); - err = -EFAULT; - break; + return -EFAULT; } ptr += sizeof(num_codecs); @@ -1194,12 +1205,14 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, ptr += len; } - if (!err && put_user(buf_len, optlen)) - err = -EFAULT; - hci_dev_unlock(hdev); hci_dev_put(hdev); + lock_sock(sk); + + if (!err && put_user(buf_len, optlen)) + err = -EFAULT; + break; default: diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 638a4d5359db..4bc6761517bb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -868,12 +868,17 @@ static unsigned int ip_sabotage_in(void *priv, { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge && !nf_bridge->in_prerouting && - !netif_is_l3_master(skb->dev) && - !netif_is_l3_slave(skb->dev)) { - nf_bridge_info_free(skb); - state->okfn(state->net, state->sk, skb); - return NF_STOLEN; + if (nf_bridge) { + if (nf_bridge->sabotage_in_done) + return NF_ACCEPT; + + if (!nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev) && + !netif_is_l3_slave(skb->dev)) { + nf_bridge->sabotage_in_done = 1; + state->okfn(state->net, state->sk, skb); + return NF_STOLEN; + } } return NF_ACCEPT; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index de18e9c1d7a7..ba95c4d74a60 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -148,6 +148,17 @@ br_switchdev_fdb_notify(struct net_bridge *br, if (test_bit(BR_FDB_LOCKED, &fdb->flags)) return; + /* Entries with these flags were created using ndm_state == NUD_REACHABLE, + * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something + * equivalent to 'bridge fdb add ... master dynamic (sticky)'. + * Drivers don't know how to deal with these, so don't notify them to + * avoid confusing them. + */ + if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) && + !test_bit(BR_FDB_STATIC, &fdb->flags) && + !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) + return; + br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { diff --git a/net/core/dev.c b/net/core/dev.c index 253584777101..1488f700bf81 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3199,6 +3199,7 @@ static u16 skb_tx_hash(const struct net_device *dev, } if (skb_rx_queue_recorded(skb)) { + DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; @@ -10846,7 +10847,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, - portid, nlmsg_seq(nlh)); + portid, nlh); /* * Flush the unicast and multicast chains diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7b69cf882b8e..3e3598cd49f2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -20,6 +20,7 @@ #include <linux/sched/task.h> #include <linux/uidgid.h> #include <linux/cookie.h> +#include <linux/proc_fs.h> #include <net/sock.h> #include <net/netlink.h> @@ -676,21 +677,19 @@ EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { - struct file *file; - struct ns_common *ns; - struct net *net; + struct fd f = fdget(fd); + struct net *net = ERR_PTR(-EINVAL); - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return ERR_CAST(file); + if (!f.file) + return ERR_PTR(-EBADF); - ns = get_proc_ns(file_inode(file)); - if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); - else - net = ERR_PTR(-EINVAL); + if (proc_ns_file(f.file)) { + struct ns_common *ns = get_proc_ns(file_inode(f.file)); + if (ns->ops == &netns_operations) + net = get_net(container_of(ns, struct net, ns)); + } + fdput(f); - fput(file); return net; } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5d8eb57867a9..6e44e92ebdf5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3972,16 +3972,23 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex, u32 portid, u32 seq) + int new_ifindex, u32 portid, + const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; + u32 seq = 0; skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; + if (nlmsg_report(nlh)) + seq = nlmsg_seq(nlh); + else + portid = 0; + err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); @@ -4017,7 +4024,7 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, - new_ifindex, portid, nlmsg_seq(nlh)); + new_ifindex, portid, nlh); if (skb) rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1a31815104d6..4c0879798eb8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5599,18 +5599,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; - /* In general, avoid mixing slab allocated and page_pool allocated - * pages within the same SKB. However when @to is not pp_recycle and - * @from is cloned, we can transition frag pages from page_pool to - * reference counted. - * - * On the other hand, don't allow coalescing two pp_recycle SKBs if - * @from is cloned, in case the SKB is using page_pool fragment + /* In general, avoid mixing page_pool and non-page_pool allocated + * pages within the same SKB. Additionally avoid dealing with clones + * with page_pool pages, in case the SKB is using page_pool fragment * references (PP_FLAG_PAGE_FRAG). Since we only take full page * references for cloned SKBs at the moment that would result in * inconsistent reference counts. + * In theory we could take full references if @from is cloned and + * !@to->pp_recycle but its tricky (due to potential race with + * the clone disappearing) and rare, so not worth dealing with. */ - if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) + if (to->pp_recycle != from->pp_recycle || + (from->pp_recycle && skb_cloned(from))) return false; if (len <= skb_tailroom(to)) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 74842b453407..782273bb93c2 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -177,7 +177,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu(orig_sock_table); + kvfree_rcu_mightsleep(orig_sock_table); } } } @@ -215,7 +215,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu(cur); + kfree_rcu_mightsleep(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); diff --git a/net/core/xdp.c b/net/core/xdp.c index 528d4b37983d..fb85aca81961 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -734,13 +734,21 @@ __bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *tim * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. * @ctx: XDP context pointer. * @hash: Return value pointer. + * @rss_type: Return value pointer for RSS type. + * + * The RSS hash type (@rss_type) specifies what portion of packet headers NIC + * hardware used when calculating RSS hash value. The RSS type can be decoded + * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits + * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types* + * ``XDP_RSS_TYPE_L*``. * * Return: * * Returns 0 on success or ``-errno`` on error. * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc * * ``-ENODATA`` : means no RX-hash available for this frame */ -__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash) +__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { return -EOPNOTSUPP; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d0cc4ef2b85..40fe70fc2015 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -25,6 +25,7 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; static int ip_privileged_port_min; @@ -1198,6 +1199,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_app_win_max, }, { .procname = "tcp_adv_win_scale", diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ea370afa70ed..b9d55277cb85 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2780,7 +2780,7 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) { while (iter->cur_sk < iter->end_sk) - sock_put(iter->batch[iter->cur_sk++]); + sock_gen_put(iter->batch[iter->cur_sk++]); } static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, @@ -2941,7 +2941,7 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) * st->bucket. See tcp_seek_last_pos(). */ st->offset++; - sock_put(iter->batch[iter->cur_sk++]); + sock_gen_put(iter->batch[iter->cur_sk++]); } if (iter->cur_sk < iter->end_sk) diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index 488aec9e1a74..d1876f192225 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -32,7 +32,8 @@ static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i) size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, unsigned char cmpre) { - return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); + return sizeof(struct ipv6_rpl_sr_hdr) + (n * IPV6_PFXTAIL_LEN(cmpri)) + + IPV6_PFXTAIL_LEN(cmpre); } void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9fb2f33ee3a7..a675acfb901d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1395,9 +1395,11 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) msg->msg_name = &sin; msg->msg_namelen = sizeof(sin); do_udp_sendmsg: - if (ipv6_only_sock(sk)) - return -ENETUNREACH; - return udp_sendmsg(sk, msg, len); + err = ipv6_only_sock(sk) ? + -ENETUNREACH : udp_sendmsg(sk, msg, len); + msg->msg_name = sin6; + msg->msg_namelen = addr_len; + return err; } } diff --git a/net/mac802154/scan.c b/net/mac802154/scan.c index 9b0933a185eb..5c191bedd72c 100644 --- a/net/mac802154/scan.c +++ b/net/mac802154/scan.c @@ -52,7 +52,7 @@ static int mac802154_scan_cleanup_locked(struct ieee802154_local *local, request = rcu_replace_pointer(local->scan_req, NULL, 1); if (!request) return 0; - kfree_rcu(request); + kvfree_rcu_mightsleep(request); /* Advertize first, while we know the devices cannot be removed */ if (aborted) @@ -403,7 +403,7 @@ int mac802154_stop_beacons_locked(struct ieee802154_local *local, request = rcu_replace_pointer(local->beacon_req, NULL, 1); if (!request) return 0; - kfree_rcu(request); + kvfree_rcu_mightsleep(request); nl802154_beaconing_done(wpan_dev); diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index d237d142171c..bceaab8dd8e4 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -9,11 +9,18 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req) { - struct sock *ssk = subflow->tcp_sock; - struct sock *sk = subflow->conn; + struct sock *sk, *ssk; struct sk_buff *skb; struct tcp_sock *tp; + /* on early fallback the subflow context is deleted by + * subflow_syn_recv_sock() + */ + if (!subflow) + return; + + ssk = subflow->tcp_sock; + sk = subflow->conn; tp = tcp_sk(ssk); subflow->is_mptfo = 1; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b30cea2fbf3f..355f798d575a 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1192,9 +1192,8 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && - mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) && - schedule_work(&msk->work)) - sock_hold(subflow->conn); + mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) + mptcp_schedule_work((struct sock *)msk); return true; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 60b23b2716c4..b998e9df53ce 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2315,7 +2315,26 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push, dispose_it; + bool dispose_it, need_push = false; + + /* If the first subflow moved to a close state before accept, e.g. due + * to an incoming reset, mptcp either: + * - if either the subflow or the msk are dead, destroy the context + * (the subflow socket is deleted by inet_child_forget) and the msk + * - otherwise do nothing at the moment and take action at accept and/or + * listener shutdown - user-space must be able to accept() the closed + * socket. + */ + if (msk->in_accept_queue && msk->first == ssk) { + if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD)) + return; + + /* ensure later check in mptcp_worker() will dispose the msk */ + sock_set_flag(sk, SOCK_DEAD); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_drop_ctx(ssk); + goto out_release; + } dispose_it = !msk->subflow || ssk != msk->subflow->sk; if (dispose_it) @@ -2351,28 +2370,22 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); - } else if (msk->in_accept_queue && msk->first == ssk) { - /* if the first subflow moved to a close state, e.g. due to - * incoming reset and we reach here before inet_child_forget() - * the TCP stack could later try to close it via - * inet_csk_listen_stop(), or deliver it to the user space via - * accept(). - * We can't delete the subflow - or risk a double free - nor let - * the msk survive - or will be leaked in the non accept scenario: - * fallback and let TCP cope with the subflow cleanup. - */ - WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD)); - mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + } __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } + +out_release: release_sock(ssk); sock_put(ssk); @@ -2427,21 +2440,14 @@ static void __mptcp_close_subflow(struct sock *sk) mptcp_close_ssk(sk, ssk, subflow); } - /* if the MPC subflow has been closed before the msk is accepted, - * msk will never be accept-ed, close it now - */ - if (!msk->first && msk->in_accept_queue) { - sock_set_flag(sk, SOCK_DEAD); - inet_sk_state_store(sk, TCP_CLOSE); - } } -static bool mptcp_check_close_timeout(const struct sock *sk) +static bool mptcp_should_close(const struct sock *sk) { s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; struct mptcp_subflow_context *subflow; - if (delta >= TCP_TIMEWAIT_LEN) + if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue) return true; /* if all subflows are in closed status don't bother with additional @@ -2626,7 +2632,7 @@ static void mptcp_worker(struct work_struct *work) lock_sock(sk); state = sk->sk_state; - if (unlikely(state == TCP_CLOSE)) + if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_data_fin_ack(sk); @@ -2649,7 +2655,7 @@ static void mptcp_worker(struct work_struct *work) * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_check_close_timeout(sk)) { + if (mptcp_should_close(sk)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); } @@ -2895,6 +2901,14 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +void __mptcp_unaccepted_force_close(struct sock *sk) +{ + sock_set_flag(sk, SOCK_DEAD); + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + __mptcp_destroy_sock(sk); +} + static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { /* Concurrent splices from sk_receive_queue into receive_queue will @@ -3733,6 +3747,18 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + + /* Do late cleanup for the first subflow as necessary. Also + * deal with bad peers not doing a complete shutdown. + */ + if (msk->first && + unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + __mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first), 0); + if (unlikely(list_empty(&msk->conn_list))) + inet_sk_state_store(newsk, TCP_CLOSE); + } + release_sock(newsk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 339a6f072989..d6469b6ab38e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -629,10 +629,12 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a0041360ee9d..281c1cc8dc8d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -408,9 +408,8 @@ void mptcp_subflow_reset(struct sock *ssk) tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); - if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && - schedule_work(&mptcp_sk(sk)->work)) - return; /* worker will put sk for us */ + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); sock_put(sk); } @@ -724,9 +723,12 @@ void mptcp_subflow_drop_ctx(struct sock *ssk) if (!ctx) return; - subflow_ulp_fallback(ssk, ctx); - if (ctx->conn) - sock_put(ctx->conn); + list_del(&mptcp_subflow_ctx(ssk)->node); + if (inet_csk(ssk)->icsk_ulp_ops) { + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + } kfree_rcu(ctx, rcu); } @@ -1118,8 +1120,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk, skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } else { - if (updated && schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + if (updated) + mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } @@ -1222,17 +1224,12 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { - struct sock *sk = (struct sock *)msk; - if (likely(ssk->sk_state != TCP_CLOSE)) return; if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { - sock_hold(sk); - if (!schedule_work(&msk->work)) - sock_put(sk); - } + !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work((struct sock *)msk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) @@ -1825,6 +1822,77 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + struct sock *sk; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + sk = subflow->conn; + msk = mptcp_sk(sk); + if (msk->dl_next || msk == head) + continue; + + sock_hold(sk); + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + sk = (struct sock *)msk; + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + next = msk->dl_next; + msk->dl_next = NULL; + + __mptcp_unaccepted_force_close(sk); + release_sock(sk); + + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. + */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); + mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); + + sock_put(sk); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6004d4b24451..e48ab8dfb541 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3447,6 +3447,64 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) return 0; } +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; + const struct nft_data *data; + int err; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; + default: + break; + } + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + ret = nft_setelem_validate(ctx, set, NULL, &elem); + if (ret < 0) + return ret; + } + + return ret; +} + static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); @@ -4759,12 +4817,6 @@ err_set_name: return err; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - static void nft_set_catchall_destroy(const struct nft_ctx *ctx, struct nft_set *set) { @@ -6056,7 +6108,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) || + (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY])) return -EINVAL; if (flags != 0) { @@ -7052,7 +7105,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, } if (nla[NFTA_OBJ_USERDATA]) { - obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL); + obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT); if (obj->udata == NULL) goto err_userdata; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index cae5a6724163..cecf8ab90e58 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -199,37 +199,6 @@ nla_put_failure: return -1; } -static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - struct nft_ctx *pctx = (struct nft_ctx *)ctx; - const struct nft_data *data; - int err; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - pctx->level++; - err = nft_chain_validate(ctx, data->verdict.chain); - if (err < 0) - return err; - pctx->level--; - break; - default: - break; - } - - return 0; -} - static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **d) @@ -245,9 +214,12 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, iter.skip = 0; iter.count = 0; iter.err = 0; - iter.fn = nft_lookup_validate_setelem; + iter.fn = nft_setelem_validate; priv->set->ops->walk(ctx, priv->set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_validate(ctx, priv->set); + if (iter.err < 0) return iter.err; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ca3ebfdb3023..a8cf9a88758e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -913,7 +913,7 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport)) { + if (likely(vport && netif_carrier_ok(vport->dev))) { u16 mru = OVS_CB(skb)->mru; u32 cutlen = OVS_CB(skb)->cutlen; diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 3a70255c8d02..76f0434d3d06 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -498,6 +498,11 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (!size || len != ALIGN(size, 4) + hdrlen) goto err; + if ((cb->type == QRTR_TYPE_NEW_SERVER || + cb->type == QRTR_TYPE_RESUME_TX) && + size < sizeof(struct qrtr_ctrl_pkt)) + goto err; + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && cb->type != QRTR_TYPE_RESUME_TX) goto err; @@ -510,9 +515,6 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) /* Remote node endpoint can bridge other distant nodes */ const struct qrtr_ctrl_pkt *pkt; - if (size < sizeof(*pkt)) - goto err; - pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2a6b6be0811b..35785a36c802 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3235,6 +3235,9 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, err_miss_alloc: tcf_exts_destroy(exts); +#ifdef CONFIG_NET_CLS_ACT + exts->actions = NULL; +#endif return err; } EXPORT_SYMBOL(tcf_exts_init_ex); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index cf5ebe43b3b4..02098a02943e 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -421,15 +421,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else weight = 1; - if (tb[TCA_QFQ_LMAX]) { + if (tb[TCA_QFQ_LMAX]) lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); - if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { - pr_notice("qfq: invalid max length %u\n", lmax); - return -EINVAL; - } - } else + else lmax = psched_mtu(qdisc_dev(sch)); + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 94727feb07b3..b046b11200c9 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -1154,7 +1154,8 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = chunk->subh.ifwdtsn_hdr->skip; \ - (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++) + (void *)pos <= (void *)chunk->subh.ifwdtsn_hdr->skip + (end) - \ + sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c6b4a62276f6..50c38b624f77 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3270,6 +3270,17 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, sk_common_release(sk); goto out; } + + /* smc_clcsock_release() does not wait smc->clcsock->sk's + * destruction; its sk_state might not be TCP_CLOSE after + * smc->sk is close()d, and TCP timers can be fired later, + * which need net ref. + */ + sk = smc->clcsock->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); } else { smc->clcsock = clcsock; } diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index ce0541e32fc9..95ca783795c5 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -73,7 +73,6 @@ static void checksum_case(struct kunit *test) { const struct gss_krb5_test_param *param = test->param_value; struct xdr_buf buf = { - .head[0].iov_base = param->plaintext->data, .head[0].iov_len = param->plaintext->len, .len = param->plaintext->len, }; @@ -99,6 +98,10 @@ static void checksum_case(struct kunit *test) err = crypto_ahash_setkey(tfm, Kc.data, Kc.len); KUNIT_ASSERT_EQ(test, err, 0); + buf.head[0].iov_base = kunit_kzalloc(test, buf.head[0].iov_len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf.head[0].iov_base); + memcpy(buf.head[0].iov_base, param->plaintext->data, buf.head[0].iov_len); + checksum.len = gk5e->cksumlength; checksum.data = kunit_kzalloc(test, checksum.len, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, checksum.data); @@ -1327,6 +1330,7 @@ static void rfc6803_encrypt_case(struct kunit *test) if (!gk5e) kunit_skip(test, "Encryption type is not available"); + memset(usage_data, 0, sizeof(usage_data)); usage.data[3] = param->constant; Ke.len = gk5e->Ke_length; diff --git a/rust/Makefile b/rust/Makefile index f88d108fbef0..aef85e9e8eeb 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -262,6 +262,20 @@ BINDGEN_TARGET := $(BINDGEN_TARGET_$(SRCARCH)) # some configurations, with new GCC versions, etc. bindgen_extra_c_flags = -w --target=$(BINDGEN_TARGET) +# Auto variable zero-initialization requires an additional special option with +# clang that is going to be removed sometime in the future (likely in +# clang-18), so make sure to pass this option only if clang supports it +# (libclang major version < 16). +# +# https://github.com/llvm/llvm-project/issues/44842 +# https://github.com/llvm/llvm-project/blob/llvmorg-16.0.0-rc2/clang/docs/ReleaseNotes.rst#deprecated-compiler-flags +ifdef CONFIG_INIT_STACK_ALL_ZERO +libclang_maj_ver=$(shell $(BINDGEN) $(srctree)/scripts/rust_is_available_bindgen_libclang.h 2>&1 | sed -ne 's/.*clang version \([0-9]*\).*/\1/p') +ifeq ($(shell expr $(libclang_maj_ver) \< 16), 1) +bindgen_extra_c_flags += -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang +endif +endif + bindgen_c_flags = $(filter-out $(bindgen_skip_c_flags), $(c_flags)) \ $(bindgen_extra_c_flags) endif @@ -283,7 +297,7 @@ quiet_cmd_bindgen = BINDGEN $@ $(bindgen_target_cflags) $(bindgen_target_extra) $(obj)/bindings/bindings_generated.rs: private bindgen_target_flags = \ - $(shell grep -v '^\#\|^$$' $(srctree)/$(src)/bindgen_parameters) + $(shell grep -v '^#\|^$$' $(srctree)/$(src)/bindgen_parameters) $(obj)/bindings/bindings_generated.rs: $(src)/bindings/bindings_helper.h \ $(src)/bindgen_parameters FORCE $(call if_changed_dep,bindgen) diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs index 30103325696d..8009184bf6d7 100644 --- a/rust/kernel/print.rs +++ b/rust/kernel/print.rs @@ -18,7 +18,11 @@ use crate::bindings; // Called from `vsprintf` with format specifier `%pA`. #[no_mangle] -unsafe fn rust_fmt_argument(buf: *mut c_char, end: *mut c_char, ptr: *const c_void) -> *mut c_char { +unsafe extern "C" fn rust_fmt_argument( + buf: *mut c_char, + end: *mut c_char, + ptr: *const c_void, +) -> *mut c_char { use fmt::Write; // SAFETY: The C contract guarantees that `buf` is valid if it's less than `end`. let mut w = unsafe { RawFormatter::from_ptrs(buf.cast(), end.cast()) }; diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index b771310fa4a4..cd3d2a6cf1fc 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -408,7 +408,7 @@ impl RawFormatter { /// If `pos` is less than `end`, then the region between `pos` (inclusive) and `end` /// (exclusive) must be valid for writes for the lifetime of the returned [`RawFormatter`]. pub(crate) unsafe fn from_ptrs(pos: *mut u8, end: *mut u8) -> Self { - // INVARIANT: The safety requierments guarantee the type invariants. + // INVARIANT: The safety requirements guarantee the type invariants. Self { beg: pos as _, pos: pos as _, diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 61f72eb8d9be..4000ad04c122 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -27,21 +27,6 @@ fi ; \ tar -I $(KGZIP) -c $(RCS_TAR_IGNORE) -f $(2).tar.gz \ --transform 's:^:$(2)/:S' $(TAR_CONTENT) $(3) -# tarball compression -# --------------------------------------------------------------------------- - -%.tar.gz: %.tar - $(call cmd,gzip) - -%.tar.bz2: %.tar - $(call cmd,bzip2) - -%.tar.xz: %.tar - $(call cmd,xzmisc) - -%.tar.zst: %.tar - $(call cmd,zstd) - # Git # --------------------------------------------------------------------------- @@ -57,16 +42,24 @@ check-git: false; \ fi +git-config-tar.gz = -c tar.tar.gz.command="$(KGZIP)" +git-config-tar.bz2 = -c tar.tar.bz2.command="$(KBZIP2)" +git-config-tar.xz = -c tar.tar.xz.command="$(XZ)" +git-config-tar.zst = -c tar.tar.zst.command="$(ZSTD)" + +quiet_cmd_archive = ARCHIVE $@ + cmd_archive = git -C $(srctree) $(git-config-tar$(suffix $@)) archive \ + --output=$$(realpath $@) $(archive-args) + # Linux source tarball # --------------------------------------------------------------------------- -quiet_cmd_archive_linux = ARCHIVE $@ - cmd_archive_linux = \ - git -C $(srctree) archive --output=$$(realpath $@) --prefix=$(basename $@)/ $$(cat $<) +linux-tarballs := $(addprefix linux, .tar.gz) -targets += linux.tar -linux.tar: .tmp_HEAD FORCE - $(call if_changed,archive_linux) +targets += $(linux-tarballs) +$(linux-tarballs): archive-args = --prefix=linux/ $$(cat $<) +$(linux-tarballs): .tmp_HEAD FORCE + $(call if_changed,archive) # rpm-pkg # --------------------------------------------------------------------------- @@ -94,7 +87,7 @@ binrpm-pkg: $(UTS_MACHINE)-linux -bb $(objtree)/binkernel.spec quiet_cmd_debianize = GEN $@ - cmd_debianize = $(srctree)/scripts/package/mkdebian + cmd_debianize = $(srctree)/scripts/package/mkdebian $(mkdebian-opts) debian: FORCE $(call cmd,debianize) @@ -103,6 +96,7 @@ PHONY += debian-orig debian-orig: private source = $(shell dpkg-parsechangelog -S Source) debian-orig: private version = $(shell dpkg-parsechangelog -S Version | sed 's/-[^-]*$$//') debian-orig: private orig-name = $(source)_$(version).orig.tar.gz +debian-orig: mkdebian-opts = --need-source debian-orig: linux.tar.gz debian $(Q)if [ "$(df --output=target .. 2>/dev/null)" = "$(df --output=target $< 2>/dev/null)" ]; then \ ln -f $< ../$(orig-name); \ @@ -145,10 +139,17 @@ tar-install: FORCE $(Q)$(MAKE) -f $(srctree)/Makefile +$(Q)$(srctree)/scripts/package/buildtar $@ +compress-tar.gz = -I "$(KGZIP)" +compress-tar.bz2 = -I "$(KBZIP2)" +compress-tar.xz = -I "$(XZ)" +compress-tar.zst = -I "$(ZSTD)" + quiet_cmd_tar = TAR $@ - cmd_tar = cd $<; tar cf ../$@ --owner=root --group=root --sort=name * + cmd_tar = cd $<; tar cf ../$@ $(compress-tar$(suffix $@)) --owner=root --group=root --sort=name * -linux-$(KERNELRELEASE)-$(ARCH).tar: tar-install +dir-tarballs := $(addprefix linux-$(KERNELRELEASE)-$(ARCH), .tar .tar.gz .tar.bz2 .tar.xz .tar.zst) + +$(dir-tarballs): tar-install $(call cmd,tar) PHONY += dir-pkg @@ -180,16 +181,17 @@ quiet_cmd_perf_version_file = GEN $@ .tmp_perf/PERF-VERSION-FILE: .tmp_HEAD $(srctree)/tools/perf/util/PERF-VERSION-GEN | .tmp_perf $(call cmd,perf_version_file) -quiet_cmd_archive_perf = ARCHIVE $@ - cmd_archive_perf = \ - git -C $(srctree) archive --output=$$(realpath $@) --prefix=$(basename $@)/ \ - --add-file=$$(realpath $(word 2, $^)) \ +perf-archive-args = --add-file=$$(realpath $(word 2, $^)) \ --add-file=$$(realpath $(word 3, $^)) \ $$(cat $(word 2, $^))^{tree} $$(cat $<) -targets += perf-$(KERNELVERSION).tar -perf-$(KERNELVERSION).tar: tools/perf/MANIFEST .tmp_perf/HEAD .tmp_perf/PERF-VERSION-FILE FORCE - $(call if_changed,archive_perf) + +perf-tarballs := $(addprefix perf-$(KERNELVERSION), .tar .tar.gz .tar.bz2 .tar.xz .tar.zst) + +targets += $(perf-tarballs) +$(perf-tarballs): archive-args = --prefix=perf-$(KERNELVERSION)/ $(perf-archive-args) +$(perf-tarballs): tools/perf/MANIFEST .tmp_perf/HEAD .tmp_perf/PERF-VERSION-FILE FORCE + $(call if_changed,archive) PHONY += perf-tar-src-pkg perf-tar-src-pkg: perf-$(KERNELVERSION).tar diff --git a/scripts/asn1_compiler.c b/scripts/asn1_compiler.c index 7b6756a8c15d..4c3f645065a4 100644 --- a/scripts/asn1_compiler.c +++ b/scripts/asn1_compiler.c @@ -625,7 +625,7 @@ int main(int argc, char **argv) p = strrchr(argv[1], '/'); p = p ? p + 1 : argv[1]; grammar_name = strdup(p); - if (!p) { + if (!grammar_name) { perror(NULL); exit(1); } diff --git a/scripts/cc-version.sh b/scripts/cc-version.sh index 0573c92e841d..a7e28b6a514e 100755 --- a/scripts/cc-version.sh +++ b/scripts/cc-version.sh @@ -45,10 +45,6 @@ Clang) version=$2.$3.$4 min_version=$($min_tool_version llvm) ;; -ICC) - version=$(($2 / 100)).$(($2 % 100)).$3 - min_version=$($min_tool_version icc) - ;; *) echo "$orig_args: unknown C compiler" >&2 exit 1 diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bd44d12965c9..4bfbe3c9fa15 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6388,6 +6388,15 @@ sub process { } } +# check for soon-to-be-deprecated single-argument k[v]free_rcu() API + if ($line =~ /\bk[v]?free_rcu\s*\([^(]+\)/) { + if ($line =~ /\bk[v]?free_rcu\s*\([^,]+\)/) { + ERROR("DEPRECATED_API", + "Single-argument k[v]free_rcu() API is deprecated, please pass rcu_head object or call k[v]free_rcu_mightsleep()." . $herecurr); + } + } + + # check for unnecessary "Out of Memory" messages if ($line =~ /^\+.*\b$logFunctions\s*\(/ && $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ && diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py index ecc7ea9a4dcf..946e250c1b2a 100755 --- a/scripts/generate_rust_analyzer.py +++ b/scripts/generate_rust_analyzer.py @@ -104,7 +104,10 @@ def generate_crates(srctree, objtree, sysroot_src): name = path.name.replace(".rs", "") # Skip those that are not crate roots. - if f"{name}.o" not in open(path.parent / "Makefile").read(): + try: + if f"{name}.o" not in open(path.parent / "Makefile").read(): + continue + except FileNotFoundError: continue logging.info("Adding %s", name) diff --git a/scripts/is_rust_module.sh b/scripts/is_rust_module.sh index 28b3831a7593..464761a7cf7f 100755 --- a/scripts/is_rust_module.sh +++ b/scripts/is_rust_module.sh @@ -13,4 +13,4 @@ set -e # # In the future, checking for the `.comment` section may be another # option, see https://github.com/rust-lang/rust/pull/97550. -${NM} "$*" | grep -qE '^[0-9a-fA-F]+ r _R[^[:space:]]+16___IS_RUST_MODULE[^[:space:]]*$' +${NM} "$*" | grep -qE '^[0-9a-fA-F]+ [Rr] _R[^[:space:]]+16___IS_RUST_MODULE[^[:space:]]*$' diff --git a/scripts/package/gen-diff-patch b/scripts/package/gen-diff-patch index f842ab50a780..8a98b7bb78a0 100755 --- a/scripts/package/gen-diff-patch +++ b/scripts/package/gen-diff-patch @@ -1,44 +1,36 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0-only -diff_patch="${1}" -untracked_patch="${2}" -srctree=$(dirname $0)/../.. +diff_patch=$1 -rm -f ${diff_patch} ${untracked_patch} +mkdir -p "$(dirname "${diff_patch}")" -if ! ${srctree}/scripts/check-git; then - exit -fi - -mkdir -p "$(dirname ${diff_patch})" "$(dirname ${untracked_patch})" +git -C "${srctree:-.}" diff HEAD > "${diff_patch}" -git -C "${srctree}" diff HEAD > "${diff_patch}" - -if [ ! -s "${diff_patch}" ]; then - rm -f "${diff_patch}" +if [ ! -s "${diff_patch}" ] || + [ -z "$(git -C "${srctree:-.}" ls-files --other --exclude-standard | head -n1)" ]; then exit fi -git -C ${srctree} status --porcelain --untracked-files=all | -while read stat path -do - if [ "${stat}" = '??' ]; then - - if ! diff -u /dev/null "${srctree}/${path}" > .tmp_diff && - ! head -n1 .tmp_diff | grep -q "Binary files"; then - { - echo "--- /dev/null" - echo "+++ linux/$path" - cat .tmp_diff | tail -n +3 - } >> ${untracked_patch} - fi - fi -done - -rm -f .tmp_diff - -if [ ! -s "${diff_patch}" ]; then - rm -f "${diff_patch}" - exit -fi +# The source tarball, which is generated by 'git archive', contains everything +# you committed in the repository. If you have local diff ('git diff HEAD'), +# it will go into ${diff_patch}. If untracked files are remaining, the resulting +# source package may not be correct. +# +# Examples: +# - You modified a source file to add #include "new-header.h" +# but forgot to add new-header.h +# - You modified a Makefile to add 'obj-$(CONFIG_FOO) += new-dirver.o' +# but you forgot to add new-driver.c +# +# You need to commit them, or at least stage them by 'git add'. +# +# This script does not take care of untracked files because doing so would +# introduce additional complexity. Instead, print a warning message here if +# untracked files are found. +# If all untracked files are just garbage, you can ignore this warning. +echo >&2 "============================ WARNING ============================" +echo >&2 "Your working tree has diff from HEAD, and also untracked file(s)." +echo >&2 "Please make sure you did 'git add' for all new files you need in" +echo >&2 "the source package." +echo >&2 "=================================================================" diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index e20a2b5be9eb..74b83c9ae0a8 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -84,7 +84,66 @@ set_debarch() { fi } +# Create debian/source/ if it is a source package build +gen_source () +{ + mkdir -p debian/source + + echo "3.0 (quilt)" > debian/source/format + + { + echo "diff-ignore" + echo "extend-diff-ignore = .*" + } > debian/source/local-options + + # Add .config as a patch + mkdir -p debian/patches + { + echo "Subject: Add .config" + echo "Author: ${maintainer}" + echo + echo "--- /dev/null" + echo "+++ linux/.config" + diff -u /dev/null "${KCONFIG_CONFIG}" | tail -n +3 + } > debian/patches/config.patch + echo config.patch > debian/patches/series + + "${srctree}/scripts/package/gen-diff-patch" debian/patches/diff.patch + if [ -s debian/patches/diff.patch ]; then + sed -i " + 1iSubject: Add local diff + 1iAuthor: ${maintainer} + 1i + " debian/patches/diff.patch + + echo diff.patch >> debian/patches/series + else + rm -f debian/patches/diff.patch + fi +} + rm -rf debian +mkdir debian + +email=${DEBEMAIL-$EMAIL} + +# use email string directly if it contains <email> +if echo "${email}" | grep -q '<.*>'; then + maintainer=${email} +else + # or construct the maintainer string + user=${KBUILD_BUILD_USER-$(id -nu)} + name=${DEBFULLNAME-${user}} + if [ -z "${email}" ]; then + buildhost=${KBUILD_BUILD_HOST-$(hostname -f 2>/dev/null || hostname)} + email="${user}@${buildhost}" + fi + maintainer="${name} <${email}>" +fi + +if [ "$1" = --need-source ]; then + gen_source +fi # Some variables and settings used throughout the script version=$KERNELRELEASE @@ -104,22 +163,6 @@ fi debarch= set_debarch -email=${DEBEMAIL-$EMAIL} - -# use email string directly if it contains <email> -if echo $email | grep -q '<.*>'; then - maintainer=$email -else - # or construct the maintainer string - user=${KBUILD_BUILD_USER-$(id -nu)} - name=${DEBFULLNAME-$user} - if [ -z "$email" ]; then - buildhost=${KBUILD_BUILD_HOST-$(hostname -f 2>/dev/null || hostname)} - email="$user@$buildhost" - fi - maintainer="$name <$email>" -fi - # Try to determine distribution if [ -n "$KDEB_CHANGELOG_DIST" ]; then distribution=$KDEB_CHANGELOG_DIST @@ -132,34 +175,6 @@ else echo >&2 "Install lsb-release or set \$KDEB_CHANGELOG_DIST explicitly" fi -mkdir -p debian/source/ -echo "3.0 (quilt)" > debian/source/format - -{ - echo "diff-ignore" - echo "extend-diff-ignore = .*" -} > debian/source/local-options - -# Add .config as a patch -mkdir -p debian/patches -{ - echo "Subject: Add .config" - echo "Author: ${maintainer}" - echo - echo "--- /dev/null" - echo "+++ linux/.config" - diff -u /dev/null "${KCONFIG_CONFIG}" | tail -n +3 -} > debian/patches/config -echo config > debian/patches/series - -$(dirname $0)/gen-diff-patch debian/patches/diff.patch debian/patches/untracked.patch -if [ -f debian/patches/diff.patch ]; then - echo diff.patch >> debian/patches/series -fi -if [ -f debian/patches/untracked.patch ]; then - echo untracked.patch >> debian/patches/series -fi - echo $debarch > debian/arch extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev:native)" extra_build_depends="$extra_build_depends, $(if_enabled_echo CONFIG_SYSTEM_TRUSTED_KEYRING libssl-dev:native)" @@ -175,7 +190,7 @@ EOF # Generate copyright file cat <<EOF > debian/copyright -This is a packacked upstream version of the Linux kernel. +This is a packaged upstream version of the Linux kernel. The sources may be found at most Linux archive sites, including: https://www.kernel.org/pub/linux/kernel diff --git a/scripts/package/mkspec b/scripts/package/mkspec index b7d1dc28a5d6..fc8ad3fbc0a9 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -19,8 +19,7 @@ else mkdir -p rpmbuild/SOURCES cp linux.tar.gz rpmbuild/SOURCES cp "${KCONFIG_CONFIG}" rpmbuild/SOURCES/config - $(dirname $0)/gen-diff-patch rpmbuild/SOURCES/diff.patch rpmbuild/SOURCES/untracked.patch - touch rpmbuild/SOURCES/diff.patch rpmbuild/SOURCES/untracked.patch + "${srctree}/scripts/package/gen-diff-patch" rpmbuild/SOURCES/diff.patch fi if grep -q CONFIG_MODULES=y include/config/auto.conf; then @@ -56,7 +55,6 @@ sed -e '/^DEL/d' -e 's/^\t*//' <<EOF $S Source0: linux.tar.gz $S Source1: config $S Source2: diff.patch -$S Source3: untracked.patch Provides: $PROVIDES $S BuildRequires: bc binutils bison dwarves $S BuildRequires: (elfutils-libelf-devel or libelf-devel) flex @@ -94,12 +92,7 @@ $S$M $S %prep $S %setup -q -n linux $S cp %{SOURCE1} .config -$S if [ -s %{SOURCE2} ]; then -$S patch -p1 < %{SOURCE2} -$S fi -$S if [ -s %{SOURCE3} ]; then -$S patch -p1 < %{SOURCE3} -$S fi +$S patch -p1 < %{SOURCE2} $S $S %build $S $MAKE %{?_smp_mflags} KERNELRELEASE=$KERNELRELEASE KBUILD_BUILD_VERSION=%{release} diff --git a/security/Kconfig b/security/Kconfig index e6db09a779b7..97abeb9b9a19 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -32,11 +32,6 @@ config SECURITY If you are unsure how to answer this question, answer N. -config SECURITY_WRITABLE_HOOKS - depends on SECURITY - bool - default n - config SECURITYFS bool "Enable the securityfs filesystem" help @@ -110,7 +105,7 @@ config INTEL_TXT See <https://www.intel.com/technology/security/> for more information about Intel(R) TXT. See <http://tboot.sourceforge.net> for more information about tboot. - See Documentation/x86/intel_txt.rst for a description of how to enable + See Documentation/arch/x86/intel_txt.rst for a description of how to enable Intel TXT support in a kernel boot. If you are unsure as to whether this is required, answer N. @@ -246,15 +241,17 @@ endchoice config LSM string "Ordered list of enabled LSMs" - default "landlock,lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK - default "landlock,lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR - default "landlock,lockdown,yama,loadpin,safesetid,integrity,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO - default "landlock,lockdown,yama,loadpin,safesetid,integrity,bpf" if DEFAULT_SECURITY_DAC - default "landlock,lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor,bpf" + default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK + default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR + default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO + default "landlock,lockdown,yama,loadpin,safesetid,bpf" if DEFAULT_SECURITY_DAC + default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf" help A comma-separated list of LSMs, in initialization order. - Any LSMs left off this list will be ignored. This can be - controlled at boot with the "lsm=" parameter. + Any LSMs left off this list, except for those with order + LSM_ORDER_FIRST and LSM_ORDER_LAST, which are always enabled + if selected in the kernel configuration, will be ignored. + This can be controlled at boot with the "lsm=" parameter. If unsure, leave this as the default. diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index d6cc4812ca53..cebba4824e60 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1209,13 +1209,13 @@ static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb /* * The cred blob is a pointer to, not an instance of, an aa_label. */ -struct lsm_blob_sizes apparmor_blob_sizes __lsm_ro_after_init = { +struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), }; -static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { +static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), LSM_HOOK_INIT(capget, apparmor_capget), @@ -1427,7 +1427,7 @@ static const struct kernel_param_ops param_ops_aaintbool = { .get = param_get_aaintbool }; /* Boot time disable flag */ -static int apparmor_enabled __lsm_ro_after_init = 1; +static int apparmor_enabled __ro_after_init = 1; module_param_named(enabled, apparmor_enabled, aaintbool, 0444); static int __init apparmor_enabled_setup(char *str) diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index e5971fa74fd7..cfaf1d0e6a5f 100644 --- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -6,7 +6,7 @@ #include <linux/lsm_hooks.h> #include <linux/bpf_lsm.h> -static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { +static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = { #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ LSM_HOOK_INIT(NAME, bpf_lsm_##NAME), #include <linux/lsm_hook_defs.h> @@ -22,7 +22,7 @@ static int __init bpf_lsm_init(void) return 0; } -struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { +struct lsm_blob_sizes bpf_lsm_blob_sizes __ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), .lbs_task = sizeof(struct bpf_storage_blob), }; diff --git a/security/commoncap.c b/security/commoncap.c index 5bb7d1e96277..0b3fc2f3afe7 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1440,7 +1440,7 @@ int cap_mmap_file(struct file *file, unsigned long reqprot, #ifdef CONFIG_SECURITY -static struct security_hook_list capability_hooks[] __lsm_ro_after_init = { +static struct security_hook_list capability_hooks[] __ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check), diff --git a/security/device_cgroup.c b/security/device_cgroup.c index bef2b9285fb3..7507d14eacc7 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -216,7 +216,7 @@ static void devcgroup_offline(struct cgroup_subsys_state *css) } /* - * called from kernel/cgroup.c with cgroup_lock() held. + * called from kernel/cgroup/cgroup.c with cgroup_lock() held. */ static struct cgroup_subsys_state * devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/security/integrity/Kconfig b/security/integrity/Kconfig index 599429f99f99..ec6e0d789da1 100644 --- a/security/integrity/Kconfig +++ b/security/integrity/Kconfig @@ -68,13 +68,34 @@ config INTEGRITY_MACHINE_KEYRING depends on INTEGRITY_ASYMMETRIC_KEYS depends on SYSTEM_BLACKLIST_KEYRING depends on LOAD_UEFI_KEYS - depends on !IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY help If set, provide a keyring to which Machine Owner Keys (MOK) may be added. This keyring shall contain just MOK keys. Unlike keys in the platform keyring, keys contained in the .machine keyring will be trusted within the kernel. +config INTEGRITY_CA_MACHINE_KEYRING + bool "Enforce Machine Keyring CA Restrictions" + depends on INTEGRITY_MACHINE_KEYRING + default n + help + The .machine keyring can be configured to enforce CA restriction + on any key added to it. By default no restrictions are in place + and all Machine Owner Keys (MOK) are added to the machine keyring. + If enabled only CA keys are added to the machine keyring, all + other MOK keys load into the platform keyring. + +config INTEGRITY_CA_MACHINE_KEYRING_MAX + bool "Only CA keys without DigitialSignature usage set" + depends on INTEGRITY_CA_MACHINE_KEYRING + default n + help + When selected, only load CA keys are loaded into the machine + keyring that contain the CA bit set along with the keyCertSign + Usage field. Keys containing the digitialSignature Usage field + will not be loaded. The remaining MOK keys are loaded into the + .platform keyring. + config LOAD_UEFI_KEYS depends on INTEGRITY_PLATFORM_KEYRING depends on EFI diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index f2193c531f4a..6f31ffe23c48 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -132,7 +132,8 @@ int __init integrity_init_keyring(const unsigned int id) | KEY_USR_READ | KEY_USR_SEARCH; if (id == INTEGRITY_KEYRING_PLATFORM || - id == INTEGRITY_KEYRING_MACHINE) { + (id == INTEGRITY_KEYRING_MACHINE && + !IS_ENABLED(CONFIG_INTEGRITY_CA_MACHINE_KEYRING))) { restriction = NULL; goto out; } @@ -144,7 +145,10 @@ int __init integrity_init_keyring(const unsigned int id) if (!restriction) return -ENOMEM; - restriction->check = restrict_link_to_ima; + if (id == INTEGRITY_KEYRING_MACHINE) + restriction->check = restrict_link_by_ca; + else + restriction->check = restrict_link_to_ima; /* * MOK keys can only be added through a read-only runtime services diff --git a/security/integrity/iint.c b/security/integrity/iint.c index 8638976f7990..c73858e8c6d5 100644 --- a/security/integrity/iint.c +++ b/security/integrity/iint.c @@ -98,14 +98,6 @@ struct integrity_iint_cache *integrity_inode_get(struct inode *inode) struct rb_node *node, *parent = NULL; struct integrity_iint_cache *iint, *test_iint; - /* - * The integrity's "iint_cache" is initialized at security_init(), - * unless it is not included in the ordered list of LSMs enabled - * on the boot command line. - */ - if (!iint_cache) - panic("%s: lsm=integrity required.\n", __func__); - iint = integrity_iint_find(inode); if (iint) return iint; @@ -182,6 +174,7 @@ static int __init integrity_iintcache_init(void) DEFINE_LSM(integrity) = { .name = "integrity", .init = integrity_iintcache_init, + .order = LSM_ORDER_LAST, }; diff --git a/security/landlock/cred.c b/security/landlock/cred.c index ec6c37f04a19..13dff2a31545 100644 --- a/security/landlock/cred.c +++ b/security/landlock/cred.c @@ -34,7 +34,7 @@ static void hook_cred_free(struct cred *const cred) landlock_put_ruleset_deferred(dom); } -static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = { +static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), LSM_HOOK_INIT(cred_free, hook_cred_free), }; diff --git a/security/landlock/fs.c b/security/landlock/fs.c index adcea0fe7e68..1c0c198f6fdb 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1280,7 +1280,7 @@ static int hook_file_truncate(struct file *const file) return -EACCES; } -static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = { +static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_free_security, hook_inode_free_security), LSM_HOOK_INIT(sb_delete, hook_sb_delete), diff --git a/security/landlock/ptrace.c b/security/landlock/ptrace.c index 4c5b9cd71286..8a06d6c492bf 100644 --- a/security/landlock/ptrace.c +++ b/security/landlock/ptrace.c @@ -108,7 +108,7 @@ static int hook_ptrace_traceme(struct task_struct *const parent) return task_ptrace(parent, current); } -static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = { +static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme), }; diff --git a/security/landlock/setup.c b/security/landlock/setup.c index 3f196d2ce4f9..0f6113528fa4 100644 --- a/security/landlock/setup.c +++ b/security/landlock/setup.c @@ -15,9 +15,9 @@ #include "ptrace.h" #include "setup.h" -bool landlock_initialized __lsm_ro_after_init = false; +bool landlock_initialized __ro_after_init = false; -struct lsm_blob_sizes landlock_blob_sizes __lsm_ro_after_init = { +struct lsm_blob_sizes landlock_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct landlock_cred_security), .lbs_file = sizeof(struct landlock_file_security), .lbs_inode = sizeof(struct landlock_inode_security), diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index d73a281adf86..b9d773f11232 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -214,7 +214,7 @@ static int loadpin_load_data(enum kernel_load_data_id id, bool contents) return loadpin_check(NULL, (enum kernel_read_file_id) id); } -static struct security_hook_list loadpin_hooks[] __lsm_ro_after_init = { +static struct security_hook_list loadpin_hooks[] __ro_after_init = { LSM_HOOK_INIT(sb_free_security, loadpin_sb_free_security), LSM_HOOK_INIT(kernel_read_file, loadpin_read_file), LSM_HOOK_INIT(kernel_load_data, loadpin_load_data), diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index a79b985e917e..68d19632aeb7 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -71,7 +71,7 @@ static int lockdown_is_locked_down(enum lockdown_reason what) return 0; } -static struct security_hook_list lockdown_hooks[] __lsm_ro_after_init = { +static struct security_hook_list lockdown_hooks[] __ro_after_init = { LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), }; diff --git a/security/security.c b/security/security.c index cf6cc576736f..d5ff7ff45b77 100644 --- a/security/security.c +++ b/security/security.c @@ -6,6 +6,7 @@ * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2016 Mellanox Technologies + * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com> */ #define pr_fmt(fmt) "LSM: " fmt @@ -41,7 +42,7 @@ * all security modules to use the same descriptions for auditing * purposes. */ -const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { +const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", @@ -74,20 +75,20 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; -struct security_hook_heads security_hook_heads __lsm_ro_after_init; +struct security_hook_heads security_hook_heads __ro_after_init; static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); static struct kmem_cache *lsm_file_cache; static struct kmem_cache *lsm_inode_cache; char *lsm_names; -static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init; +static struct lsm_blob_sizes blob_sizes __ro_after_init; /* Boot-time LSM user choice */ static __initdata const char *chosen_lsm_order; static __initdata const char *chosen_major_lsm; -static __initconst const char * const builtin_lsm_order = CONFIG_LSM; +static __initconst const char *const builtin_lsm_order = CONFIG_LSM; /* Ordered list of LSMs to initialize. */ static __initdata struct lsm_info **ordered_lsms; @@ -284,9 +285,9 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) bool found = false; for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { - if (lsm->order == LSM_ORDER_MUTABLE && - strcmp(lsm->name, name) == 0) { - append_ordered_lsm(lsm, origin); + if (strcmp(lsm->name, name) == 0) { + if (lsm->order == LSM_ORDER_MUTABLE) + append_ordered_lsm(lsm, origin); found = true; } } @@ -306,6 +307,12 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) } } + /* LSM_ORDER_LAST is always last. */ + for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { + if (lsm->order == LSM_ORDER_LAST) + append_ordered_lsm(lsm, " last"); + } + /* Disable all LSMs not in the ordered list. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) @@ -331,7 +338,8 @@ static void __init report_lsm_order(void) pr_info("initializing lsm="); /* Report each enabled LSM name, comma separated. */ - for (early = __start_early_lsm_info; early < __end_early_lsm_info; early++) + for (early = __start_early_lsm_info; + early < __end_early_lsm_info; early++) if (is_enabled(early)) pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); for (lsm = ordered_lsms; *lsm; lsm++) @@ -346,7 +354,7 @@ static void __init ordered_lsm_init(void) struct lsm_info **lsm; ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms), - GFP_KERNEL); + GFP_KERNEL); if (chosen_lsm_order) { if (chosen_major_lsm) { @@ -419,9 +427,9 @@ int __init security_init(void) { struct lsm_info *lsm; - init_debug("legacy security=%s\n", chosen_major_lsm ?: " *unspecified*"); + init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*"); init_debug(" CONFIG_LSM=%s\n", builtin_lsm_order); - init_debug("boot arg lsm=%s\n", chosen_lsm_order ?: " *unspecified*"); + init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*"); /* * Append the names of the early LSM modules now that kmalloc() is @@ -509,7 +517,7 @@ static int lsm_append(const char *new, char **result) * Each LSM has to register its hooks with the infrastructure. */ void __init security_add_hooks(struct security_hook_list *hooks, int count, - const char *lsm) + const char *lsm) { int i; @@ -778,57 +786,157 @@ static int lsm_superblock_alloc(struct super_block *sb) /* Security operations */ +/** + * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok + * @mgr: task credentials of current binder process + * + * Check whether @mgr is allowed to be the binder context manager. + * + * Return: Return 0 if permission is granted. + */ int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, 0, mgr); } +/** + * security_binder_transaction() - Check if a binder transaction is allowed + * @from: sending process + * @to: receiving process + * + * Check whether @from is allowed to invoke a binder transaction call to @to. + * + * Return: Returns 0 if permission is granted. + */ int security_binder_transaction(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transaction, 0, from, to); } +/** + * security_binder_transfer_binder() - Check if a binder transfer is allowed + * @from: sending process + * @to: receiving process + * + * Check whether @from is allowed to transfer a binder reference to @to. + * + * Return: Returns 0 if permission is granted. + */ int security_binder_transfer_binder(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transfer_binder, 0, from, to); } +/** + * security_binder_transfer_file() - Check if a binder file xfer is allowed + * @from: sending process + * @to: receiving process + * @file: file being transferred + * + * Check whether @from is allowed to transfer @file to @to. + * + * Return: Returns 0 if permission is granted. + */ int security_binder_transfer_file(const struct cred *from, const struct cred *to, struct file *file) { return call_int_hook(binder_transfer_file, 0, from, to, file); } +/** + * security_ptrace_access_check() - Check if tracing is allowed + * @child: target process + * @mode: PTRACE_MODE flags + * + * Check permission before allowing the current process to trace the @child + * process. Security modules may also want to perform a process tracing check + * during an execve in the set_security or apply_creds hooks of tracing check + * during an execve in the bprm_set_creds hook of binprm_security_ops if the + * process is being traced and its security attributes would be changed by the + * execve. + * + * Return: Returns 0 if permission is granted. + */ int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { return call_int_hook(ptrace_access_check, 0, child, mode); } +/** + * security_ptrace_traceme() - Check if tracing is allowed + * @parent: tracing process + * + * Check that the @parent process has sufficient permission to trace the + * current process before allowing the current process to present itself to the + * @parent process for tracing. + * + * Return: Returns 0 if permission is granted. + */ int security_ptrace_traceme(struct task_struct *parent) { return call_int_hook(ptrace_traceme, 0, parent); } +/** + * security_capget() - Get the capability sets for a process + * @target: target process + * @effective: effective capability set + * @inheritable: inheritable capability set + * @permitted: permitted capability set + * + * Get the @effective, @inheritable, and @permitted capability sets for the + * @target process. The hook may also perform permission checking to determine + * if the current process is allowed to see the capability sets of the @target + * process. + * + * Return: Returns 0 if the capability sets were successfully obtained. + */ int security_capget(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) + kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { return call_int_hook(capget, 0, target, - effective, inheritable, permitted); + effective, inheritable, permitted); } +/** + * security_capset() - Set the capability sets for a process + * @new: new credentials for the target process + * @old: current credentials of the target process + * @effective: effective capability set + * @inheritable: inheritable capability set + * @permitted: permitted capability set + * + * Set the @effective, @inheritable, and @permitted capability sets for the + * current process. + * + * Return: Returns 0 and update @new if permission is granted. + */ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return call_int_hook(capset, 0, new, old, - effective, inheritable, permitted); + effective, inheritable, permitted); } +/** + * security_capable() - Check if a process has the necessary capability + * @cred: credentials to examine + * @ns: user namespace + * @cap: capability requested + * @opts: capability check options + * + * Check whether the @tsk process has the @cap capability in the indicated + * credentials. @cap contains the capability <include/linux/capability.h>. + * @opts contains options for the capable check <include/linux/security.h>. + * + * Return: Returns 0 if the capability is granted. + */ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap, @@ -837,26 +945,78 @@ int security_capable(const struct cred *cred, return call_int_hook(capable, 0, cred, ns, cap, opts); } +/** + * security_quotactl() - Check if a quotactl() syscall is allowed for this fs + * @cmds: commands + * @type: type + * @id: id + * @sb: filesystem + * + * Check whether the quotactl syscall is allowed for this @sb. + * + * Return: Returns 0 if permission is granted. + */ int security_quotactl(int cmds, int type, int id, struct super_block *sb) { return call_int_hook(quotactl, 0, cmds, type, id, sb); } +/** + * security_quota_on() - Check if QUOTAON is allowed for a dentry + * @dentry: dentry + * + * Check whether QUOTAON is allowed for @dentry. + * + * Return: Returns 0 if permission is granted. + */ int security_quota_on(struct dentry *dentry) { return call_int_hook(quota_on, 0, dentry); } +/** + * security_syslog() - Check if accessing the kernel message ring is allowed + * @type: SYSLOG_ACTION_* type + * + * Check permission before accessing the kernel message ring or changing + * logging to the console. See the syslog(2) manual page for an explanation of + * the @type values. + * + * Return: Return 0 if permission is granted. + */ int security_syslog(int type) { return call_int_hook(syslog, 0, type); } +/** + * security_settime64() - Check if changing the system time is allowed + * @ts: new time + * @tz: timezone + * + * Check permission to change the system time, struct timespec64 is defined in + * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>. + * + * Return: Returns 0 if permission is granted. + */ int security_settime64(const struct timespec64 *ts, const struct timezone *tz) { return call_int_hook(settime, 0, ts, tz); } +/** + * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed + * @mm: mm struct + * @pages: number of pages + * + * Check permissions for allocating a new virtual mapping. If all LSMs return + * a positive value, __vm_enough_memory() will be called with cap_sys_admin + * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be + * called with cap_sys_admin cleared. + * + * Return: Returns 0 if permission is granted by the LSM infrastructure to the + * caller. + */ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { struct security_hook_list *hp; @@ -880,16 +1040,61 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return __vm_enough_memory(mm, pages, cap_sys_admin); } +/** + * security_bprm_creds_for_exec() - Prepare the credentials for exec() + * @bprm: binary program information + * + * If the setup in prepare_exec_creds did not setup @bprm->cred->security + * properly for executing @bprm->file, update the LSM's portion of + * @bprm->cred->security to be what commit_creds needs to install for the new + * program. This hook may also optionally check permissions (e.g. for + * transitions between security domains). The hook must set @bprm->secureexec + * to 1 if AT_SECURE should be set to request libc enable secure mode. @bprm + * contains the linux_binprm structure. + * + * Return: Returns 0 if the hook is successful and permission is granted. + */ int security_bprm_creds_for_exec(struct linux_binprm *bprm) { return call_int_hook(bprm_creds_for_exec, 0, bprm); } +/** + * security_bprm_creds_from_file() - Update linux_binprm creds based on file + * @bprm: binary program information + * @file: associated file + * + * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon + * exec, update @bprm->cred to reflect that change. This is called after + * finding the binary that will be executed without an interpreter. This + * ensures that the credentials will not be derived from a script that the + * binary will need to reopen, which when reopend may end up being a completely + * different file. This hook may also optionally check permissions (e.g. for + * transitions between security domains). The hook must set @bprm->secureexec + * to 1 if AT_SECURE should be set to request libc enable secure mode. The + * hook must add to @bprm->per_clear any personality flags that should be + * cleared from current->personality. @bprm contains the linux_binprm + * structure. + * + * Return: Returns 0 if the hook is successful and permission is granted. + */ int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { return call_int_hook(bprm_creds_from_file, 0, bprm, file); } +/** + * security_bprm_check() - Mediate binary handler search + * @bprm: binary program information + * + * This hook mediates the point when a search for a binary handler will begin. + * It allows a check against the @bprm->cred->security value which was set in + * the preceding creds_for_exec call. The argv list and envp list are reliably + * available in @bprm. This hook may be called multiple times during a single + * execve. @bprm contains the linux_binprm structure. + * + * Return: Returns 0 if the hook is successful and permission is granted. + */ int security_bprm_check(struct linux_binprm *bprm) { int ret; @@ -900,21 +1105,67 @@ int security_bprm_check(struct linux_binprm *bprm) return ima_bprm_check(bprm); } +/** + * security_bprm_committing_creds() - Install creds for a process during exec() + * @bprm: binary program information + * + * Prepare to install the new security attributes of a process being + * transformed by an execve operation, based on the old credentials pointed to + * by @current->cred and the information set in @bprm->cred by the + * bprm_creds_for_exec hook. @bprm points to the linux_binprm structure. This + * hook is a good place to perform state changes on the process such as closing + * open file descriptors to which access will no longer be granted when the + * attributes are changed. This is called immediately before commit_creds(). + */ void security_bprm_committing_creds(struct linux_binprm *bprm) { call_void_hook(bprm_committing_creds, bprm); } +/** + * security_bprm_committed_creds() - Tidy up after cred install during exec() + * @bprm: binary program information + * + * Tidy up after the installation of the new security attributes of a process + * being transformed by an execve operation. The new credentials have, by this + * point, been set to @current->cred. @bprm points to the linux_binprm + * structure. This hook is a good place to perform state changes on the + * process such as clearing out non-inheritable signal state. This is called + * immediately after commit_creds(). + */ void security_bprm_committed_creds(struct linux_binprm *bprm) { call_void_hook(bprm_committed_creds, bprm); } +/** + * security_fs_context_dup() - Duplicate a fs_context LSM blob + * @fc: destination filesystem context + * @src_fc: source filesystem context + * + * Allocate and attach a security structure to sc->security. This pointer is + * initialised to NULL by the caller. @fc indicates the new filesystem context. + * @src_fc indicates the original filesystem context. + * + * Return: Returns 0 on success or a negative error code on failure. + */ int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { return call_int_hook(fs_context_dup, 0, fc, src_fc); } +/** + * security_fs_context_parse_param() - Configure a filesystem context + * @fc: filesystem context + * @param: filesystem parameter + * + * Userspace provided a parameter to configure a superblock. The LSM can + * consume the parameter or return it to the caller for use elsewhere. + * + * Return: If the parameter is used by the LSM it should return 0, if it is + * returned to the caller -ENOPARAM is returned, otherwise a negative + * error code is returned. + */ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { @@ -933,6 +1184,16 @@ int security_fs_context_parse_param(struct fs_context *fc, return rc; } +/** + * security_sb_alloc() - Allocate a super_block LSM blob + * @sb: filesystem superblock + * + * Allocate and attach a security structure to the sb->s_security field. The + * s_security field is initialized to NULL when the structure is allocated. + * @sb contains the super_block structure to be modified. + * + * Return: Returns 0 if operation was successful. + */ int security_sb_alloc(struct super_block *sb) { int rc = lsm_superblock_alloc(sb); @@ -945,11 +1206,25 @@ int security_sb_alloc(struct super_block *sb) return rc; } +/** + * security_sb_delete() - Release super_block LSM associated objects + * @sb: filesystem superblock + * + * Release objects tied to a superblock (e.g. inodes). @sb contains the + * super_block structure being released. + */ void security_sb_delete(struct super_block *sb) { call_void_hook(sb_delete, sb); } +/** + * security_sb_free() - Free a super_block LSM blob + * @sb: filesystem superblock + * + * Deallocate and clear the sb->s_security field. @sb contains the super_block + * structure to be modified. + */ void security_sb_free(struct super_block *sb) { call_void_hook(sb_free_security, sb); @@ -957,6 +1232,12 @@ void security_sb_free(struct super_block *sb) sb->s_security = NULL; } +/** + * security_free_mnt_opts() - Free memory associated with mount options + * @mnt_opts: LSM processed mount options + * + * Free memory associated with @mnt_ops. + */ void security_free_mnt_opts(void **mnt_opts) { if (!*mnt_opts) @@ -966,12 +1247,31 @@ void security_free_mnt_opts(void **mnt_opts) } EXPORT_SYMBOL(security_free_mnt_opts); +/** + * security_sb_eat_lsm_opts() - Consume LSM mount options + * @options: mount options + * @mnt_opts: LSM processed mount options + * + * Eat (scan @options) and save them in @mnt_opts. + * + * Return: Returns 0 on success, negative values on failure. + */ int security_sb_eat_lsm_opts(char *options, void **mnt_opts) { return call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts); } EXPORT_SYMBOL(security_sb_eat_lsm_opts); +/** + * security_sb_mnt_opts_compat() - Check if new mount options are allowed + * @sb: filesystem superblock + * @mnt_opts: new mount options + * + * Determine if the new mount options in @mnt_opts are allowed given the + * existing mounted filesystem at @sb. @sb superblock being compared. + * + * Return: Returns 0 if options are compatible. + */ int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { @@ -979,6 +1279,16 @@ int security_sb_mnt_opts_compat(struct super_block *sb, } EXPORT_SYMBOL(security_sb_mnt_opts_compat); +/** + * security_sb_remount() - Verify no incompatible mount changes during remount + * @sb: filesystem superblock + * @mnt_opts: (re)mount options + * + * Extracts security system specific mount options and verifies no changes are + * being made to those options. + * + * Return: Returns 0 if permission is granted. + */ int security_sb_remount(struct super_block *sb, void *mnt_opts) { @@ -986,69 +1296,184 @@ int security_sb_remount(struct super_block *sb, } EXPORT_SYMBOL(security_sb_remount); +/** + * security_sb_kern_mount() - Check if a kernel mount is allowed + * @sb: filesystem superblock + * + * Mount this @sb if allowed by permissions. + * + * Return: Returns 0 if permission is granted. + */ int security_sb_kern_mount(struct super_block *sb) { return call_int_hook(sb_kern_mount, 0, sb); } +/** + * security_sb_show_options() - Output the mount options for a superblock + * @m: output file + * @sb: filesystem superblock + * + * Show (print on @m) mount options for this @sb. + * + * Return: Returns 0 on success, negative values on failure. + */ int security_sb_show_options(struct seq_file *m, struct super_block *sb) { return call_int_hook(sb_show_options, 0, m, sb); } +/** + * security_sb_statfs() - Check if accessing fs stats is allowed + * @dentry: superblock handle + * + * Check permission before obtaining filesystem statistics for the @mnt + * mountpoint. @dentry is a handle on the superblock for the filesystem. + * + * Return: Returns 0 if permission is granted. + */ int security_sb_statfs(struct dentry *dentry) { return call_int_hook(sb_statfs, 0, dentry); } +/** + * security_sb_mount() - Check permission for mounting a filesystem + * @dev_name: filesystem backing device + * @path: mount point + * @type: filesystem type + * @flags: mount flags + * @data: filesystem specific data + * + * Check permission before an object specified by @dev_name is mounted on the + * mount point named by @nd. For an ordinary mount, @dev_name identifies a + * device if the file system type requires a device. For a remount + * (@flags & MS_REMOUNT), @dev_name is irrelevant. For a loopback/bind mount + * (@flags & MS_BIND), @dev_name identifies the pathname of the object being + * mounted. + * + * Return: Returns 0 if permission is granted. + */ int security_sb_mount(const char *dev_name, const struct path *path, - const char *type, unsigned long flags, void *data) + const char *type, unsigned long flags, void *data) { return call_int_hook(sb_mount, 0, dev_name, path, type, flags, data); } +/** + * security_sb_umount() - Check permission for unmounting a filesystem + * @mnt: mounted filesystem + * @flags: unmount flags + * + * Check permission before the @mnt file system is unmounted. + * + * Return: Returns 0 if permission is granted. + */ int security_sb_umount(struct vfsmount *mnt, int flags) { return call_int_hook(sb_umount, 0, mnt, flags); } -int security_sb_pivotroot(const struct path *old_path, const struct path *new_path) +/** + * security_sb_pivotroot() - Check permissions for pivoting the rootfs + * @old_path: new location for current rootfs + * @new_path: location of the new rootfs + * + * Check permission before pivoting the root filesystem. + * + * Return: Returns 0 if permission is granted. + */ +int security_sb_pivotroot(const struct path *old_path, + const struct path *new_path) { return call_int_hook(sb_pivotroot, 0, old_path, new_path); } +/** + * security_sb_set_mnt_opts() - Set the mount options for a filesystem + * @sb: filesystem superblock + * @mnt_opts: binary mount options + * @kern_flags: kernel flags (in) + * @set_kern_flags: kernel flags (out) + * + * Set the security relevant mount options used for a superblock. + * + * Return: Returns 0 on success, error on failure. + */ int security_sb_set_mnt_opts(struct super_block *sb, - void *mnt_opts, - unsigned long kern_flags, - unsigned long *set_kern_flags) + void *mnt_opts, + unsigned long kern_flags, + unsigned long *set_kern_flags) { return call_int_hook(sb_set_mnt_opts, - mnt_opts ? -EOPNOTSUPP : 0, sb, - mnt_opts, kern_flags, set_kern_flags); + mnt_opts ? -EOPNOTSUPP : 0, sb, + mnt_opts, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_set_mnt_opts); +/** + * security_sb_clone_mnt_opts() - Duplicate superblock mount options + * @oldsb: source superblock + * @newsb: destination superblock + * @kern_flags: kernel flags (in) + * @set_kern_flags: kernel flags (out) + * + * Copy all security options from a given superblock to another. + * + * Return: Returns 0 on success, error on failure. + */ int security_sb_clone_mnt_opts(const struct super_block *oldsb, - struct super_block *newsb, - unsigned long kern_flags, - unsigned long *set_kern_flags) + struct super_block *newsb, + unsigned long kern_flags, + unsigned long *set_kern_flags) { return call_int_hook(sb_clone_mnt_opts, 0, oldsb, newsb, - kern_flags, set_kern_flags); + kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); -int security_move_mount(const struct path *from_path, const struct path *to_path) +/** + * security_move_mount() - Check permissions for moving a mount + * @from_path: source mount point + * @to_path: destination mount point + * + * Check permission before a mount is moved. + * + * Return: Returns 0 if permission is granted. + */ +int security_move_mount(const struct path *from_path, + const struct path *to_path) { return call_int_hook(move_mount, 0, from_path, to_path); } +/** + * security_path_notify() - Check if setting a watch is allowed + * @path: file path + * @mask: event mask + * @obj_type: file path type + * + * Check permissions before setting a watch on events as defined by @mask, on + * an object at @path, whose type is defined by @obj_type. + * + * Return: Returns 0 if permission is granted. + */ int security_path_notify(const struct path *path, u64 mask, - unsigned int obj_type) + unsigned int obj_type) { return call_int_hook(path_notify, 0, path, mask, obj_type); } +/** + * security_inode_alloc() - Allocate an inode LSM blob + * @inode: the inode + * + * Allocate and attach a security structure to @inode->i_security. The + * i_security field is initialized to NULL when the inode structure is + * allocated. + * + * Return: Return 0 if operation was successful. + */ int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); @@ -1069,6 +1494,12 @@ static void inode_free_by_rcu(struct rcu_head *head) kmem_cache_free(lsm_inode_cache, head); } +/** + * security_inode_free() - Free an inode's LSM blob + * @inode: the inode + * + * Deallocate the inode security structure and set @inode->i_security to NULL. + */ void security_inode_free(struct inode *inode) { integrity_inode_free(inode); @@ -1084,9 +1515,24 @@ void security_inode_free(struct inode *inode) */ if (inode->i_security) call_rcu((struct rcu_head *)inode->i_security, - inode_free_by_rcu); + inode_free_by_rcu); } +/** + * security_dentry_init_security() - Perform dentry initialization + * @dentry: the dentry to initialize + * @mode: mode used to determine resource type + * @name: name of the last path component + * @xattr_name: name of the security/LSM xattr + * @ctx: pointer to the resulting LSM context + * @ctxlen: length of @ctx + * + * Compute a context for a dentry as the inode is not yet available since NFSv4 + * has no label backed by an EA anyway. It is important to note that + * @xattr_name does not need to be free'd by the caller, it is a static string. + * + * Return: Returns 0 on success, negative values on failure. + */ int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, @@ -1098,7 +1544,8 @@ int security_dentry_init_security(struct dentry *dentry, int mode, /* * Only one module will provide a security context. */ - hlist_for_each_entry(hp, &security_hook_heads.dentry_init_security, list) { + hlist_for_each_entry(hp, &security_hook_heads.dentry_init_security, + list) { rc = hp->hook.dentry_init_security(dentry, mode, name, xattr_name, ctx, ctxlen); if (rc != LSM_RET_DEFAULT(dentry_init_security)) @@ -1108,15 +1555,51 @@ int security_dentry_init_security(struct dentry *dentry, int mode, } EXPORT_SYMBOL(security_dentry_init_security); +/** + * security_dentry_create_files_as() - Perform dentry initialization + * @dentry: the dentry to initialize + * @mode: mode used to determine resource type + * @name: name of the last path component + * @old: creds to use for LSM context calculations + * @new: creds to modify + * + * Compute a context for a dentry as the inode is not yet available and set + * that context in passed in creds so that new files are created using that + * context. Context is calculated using the passed in creds and not the creds + * of the caller. + * + * Return: Returns 0 on success, error on failure. + */ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, 0, dentry, mode, - name, old, new); + name, old, new); } EXPORT_SYMBOL(security_dentry_create_files_as); +/** + * security_inode_init_security() - Initialize an inode's LSM context + * @inode: the inode + * @dir: parent directory + * @qstr: last component of the pathname + * @initxattrs: callback function to write xattrs + * @fs_data: filesystem specific data + * + * Obtain the security attribute name suffix and value to set on a newly + * created inode and set up the incore security field for the new inode. This + * hook is called by the fs code as part of the inode creation transaction and + * provides for atomic labeling of the inode, unlike the post_create/mkdir/... + * hooks called by the VFS. The hook function is expected to allocate the name + * and value via kmalloc, with the caller being responsible for calling kfree + * after using them. If the security module does not use security attributes + * or does not wish to put a security attribute on this particular inode, then + * it should return -EOPNOTSUPP to skip this processing. + * + * Return: Returns 0 on success, -EOPNOTSUPP if no security attribute is + * needed, or -ENOMEM on memory allocation failure. + */ int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const initxattrs initxattrs, void *fs_data) @@ -1134,9 +1617,9 @@ int security_inode_init_security(struct inode *inode, struct inode *dir, memset(new_xattrs, 0, sizeof(new_xattrs)); lsm_xattr = new_xattrs; ret = call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr, - &lsm_xattr->name, - &lsm_xattr->value, - &lsm_xattr->value_len); + &lsm_xattr->name, + &lsm_xattr->value, + &lsm_xattr->value_len); if (ret) goto out; @@ -1152,6 +1635,18 @@ out: } EXPORT_SYMBOL(security_inode_init_security); +/** + * security_inode_init_security_anon() - Initialize an anonymous inode + * @inode: the inode + * @name: the anonymous inode class + * @context_inode: an optional related inode + * + * Set up the incore security field for the new anonymous inode and return + * whether the inode creation is permitted by the security module or not. + * + * Return: Returns 0 on success, -EACCES if the security module denies the + * creation of this inode, or another -errno upon other errors. + */ int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) @@ -1160,20 +1655,21 @@ int security_inode_init_security_anon(struct inode *inode, context_inode); } -int security_old_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, const char **name, - void **value, size_t *len) -{ - if (unlikely(IS_PRIVATE(inode))) - return -EOPNOTSUPP; - return call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, - qstr, name, value, len); -} -EXPORT_SYMBOL(security_old_inode_init_security); - #ifdef CONFIG_SECURITY_PATH -int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, - unsigned int dev) +/** + * security_path_mknod() - Check if creating a special file is allowed + * @dir: parent directory + * @dentry: new file + * @mode: new file mode + * @dev: device number + * + * Check permissions when creating a file. Note that this hook is called even + * if mknod operation is being done for a regular file. + * + * Return: Returns 0 if permission is granted. + */ +int security_path_mknod(const struct path *dir, struct dentry *dentry, + umode_t mode, unsigned int dev) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; @@ -1181,7 +1677,18 @@ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t m } EXPORT_SYMBOL(security_path_mknod); -int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) +/** + * security_path_mkdir() - Check if creating a new directory is allowed + * @dir: parent directory + * @dentry: new directory + * @mode: new directory mode + * + * Check permissions to create a new directory in the existing directory. + * + * Return: Returns 0 if permission is granted. + */ +int security_path_mkdir(const struct path *dir, struct dentry *dentry, + umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; @@ -1189,6 +1696,15 @@ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t m } EXPORT_SYMBOL(security_path_mkdir); +/** + * security_path_rmdir() - Check if removing a directory is allowed + * @dir: parent directory + * @dentry: directory to remove + * + * Check the permission to remove a directory. + * + * Return: Returns 0 if permission is granted. + */ int security_path_rmdir(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) @@ -1196,6 +1712,15 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry) return call_int_hook(path_rmdir, 0, dir, dentry); } +/** + * security_path_unlink() - Check if removing a hard link is allowed + * @dir: parent directory + * @dentry: file + * + * Check the permission to remove a hard link to a file. + * + * Return: Returns 0 if permission is granted. + */ int security_path_unlink(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) @@ -1204,6 +1729,16 @@ int security_path_unlink(const struct path *dir, struct dentry *dentry) } EXPORT_SYMBOL(security_path_unlink); +/** + * security_path_symlink() - Check if creating a symbolic link is allowed + * @dir: parent directory + * @dentry: symbolic link + * @old_name: file pathname + * + * Check the permission to create a symbolic link to a file. + * + * Return: Returns 0 if permission is granted. + */ int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { @@ -1212,6 +1747,16 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry, return call_int_hook(path_symlink, 0, dir, dentry, old_name); } +/** + * security_path_link - Check if creating a hard link is allowed + * @old_dentry: existing file + * @new_dir: new parent directory + * @new_dentry: new link + * + * Check permission before creating a new hard link to a file. + * + * Return: Returns 0 if permission is granted. + */ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { @@ -1220,19 +1765,42 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); } +/** + * security_path_rename() - Check if renaming a file is allowed + * @old_dir: parent directory of the old file + * @old_dentry: the old file + * @new_dir: parent directory of the new file + * @new_dentry: the new file + * @flags: flags + * + * Check for permission to rename a file or directory. + * + * Return: Returns 0 if permission is granted. + */ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || - (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) + (d_is_positive(new_dentry) && + IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; return call_int_hook(path_rename, 0, old_dir, old_dentry, new_dir, - new_dentry, flags); + new_dentry, flags); } EXPORT_SYMBOL(security_path_rename); +/** + * security_path_truncate() - Check if truncating a file is allowed + * @path: file + * + * Check permission before truncating the file indicated by path. Note that + * truncation permissions may also be checked based on already opened files, + * using the security_file_truncate() hook. + * + * Return: Returns 0 if permission is granted. + */ int security_path_truncate(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) @@ -1240,6 +1808,17 @@ int security_path_truncate(const struct path *path) return call_int_hook(path_truncate, 0, path); } +/** + * security_path_chmod() - Check if changing the file's mode is allowed + * @path: file + * @mode: new mode + * + * Check for permission to change a mode of the file @path. The new mode is + * specified in @mode which is a bitmask of constants from + * <include/uapi/linux/stat.h>. + * + * Return: Returns 0 if permission is granted. + */ int security_path_chmod(const struct path *path, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) @@ -1247,6 +1826,16 @@ int security_path_chmod(const struct path *path, umode_t mode) return call_int_hook(path_chmod, 0, path, mode); } +/** + * security_path_chown() - Check if changing the file's owner/group is allowed + * @path: file + * @uid: file owner + * @gid: file group + * + * Check for permission to change owner/group of a file or directory. + * + * Return: Returns 0 if permission is granted. + */ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) @@ -1254,13 +1843,32 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) return call_int_hook(path_chown, 0, path, uid, gid); } +/** + * security_path_chroot() - Check if changing the root directory is allowed + * @path: directory + * + * Check for permission to change root directory. + * + * Return: Returns 0 if permission is granted. + */ int security_path_chroot(const struct path *path) { return call_int_hook(path_chroot, 0, path); } -#endif +#endif /* CONFIG_SECURITY_PATH */ -int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) +/** + * security_inode_create() - Check if creating a file is allowed + * @dir: the parent directory + * @dentry: the file being created + * @mode: requested file mode + * + * Check permission to create a regular file. + * + * Return: Returns 0 if permission is granted. + */ +int security_inode_create(struct inode *dir, struct dentry *dentry, + umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; @@ -1268,14 +1876,33 @@ int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode } EXPORT_SYMBOL_GPL(security_inode_create); +/** + * security_inode_link() - Check if creating a hard link is allowed + * @old_dentry: existing file + * @dir: new parent directory + * @new_dentry: new link + * + * Check permission before creating a new hard link to a file. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) + struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(inode_link, 0, old_dentry, dir, new_dentry); } +/** + * security_inode_unlink() - Check if removing a hard link is allowed + * @dir: parent directory + * @dentry: file + * + * Check the permission to remove a hard link to a file. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_unlink(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) @@ -1283,14 +1910,35 @@ int security_inode_unlink(struct inode *dir, struct dentry *dentry) return call_int_hook(inode_unlink, 0, dir, dentry); } +/** + * security_inode_symlink() - Check if creating a symbolic link is allowed + * @dir: parent directory + * @dentry: symbolic link + * @old_name: existing filename + * + * Check the permission to create a symbolic link to a file. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_symlink(struct inode *dir, struct dentry *dentry, - const char *old_name) + const char *old_name) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_symlink, 0, dir, dentry, old_name); } +/** + * security_inode_mkdir() - Check if creation a new director is allowed + * @dir: parent directory + * @dentry: new directory + * @mode: new directory mode + * + * Check permissions to create a new directory in the existing directory + * associated with inode structure @dir. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) @@ -1299,6 +1947,15 @@ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } EXPORT_SYMBOL_GPL(security_inode_mkdir); +/** + * security_inode_rmdir() - Check if removing a directory is allowed + * @dir: parent directory + * @dentry: directory to be removed + * + * Check the permission to remove a directory. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) @@ -1306,32 +1963,68 @@ int security_inode_rmdir(struct inode *dir, struct dentry *dentry) return call_int_hook(inode_rmdir, 0, dir, dentry); } -int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +/** + * security_inode_mknod() - Check if creating a special file is allowed + * @dir: parent directory + * @dentry: new file + * @mode: new file mode + * @dev: device number + * + * Check permissions when creating a special file (or a socket or a fifo file + * created via the mknod system call). Note that if mknod operation is being + * done for a regular file, then the create hook will be called and not this + * hook. + * + * Return: Returns 0 if permission is granted. + */ +int security_inode_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mknod, 0, dir, dentry, mode, dev); } +/** + * security_inode_rename() - Check if renaming a file is allowed + * @old_dir: parent directory of the old file + * @old_dentry: the old file + * @new_dir: parent directory of the new file + * @new_dentry: the new file + * @flags: flags + * + * Check for permission to rename a file or directory. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { - if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || - (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) + if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || + (d_is_positive(new_dentry) && + IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(inode_rename, 0, new_dir, new_dentry, - old_dir, old_dentry); + old_dir, old_dentry); if (err) return err; } return call_int_hook(inode_rename, 0, old_dir, old_dentry, - new_dir, new_dentry); + new_dir, new_dentry); } +/** + * security_inode_readlink() - Check if reading a symbolic link is allowed + * @dentry: link + * + * Check the permission to read the symbolic link. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_readlink(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) @@ -1339,6 +2032,17 @@ int security_inode_readlink(struct dentry *dentry) return call_int_hook(inode_readlink, 0, dentry); } +/** + * security_inode_follow_link() - Check if following a symbolic link is allowed + * @dentry: link dentry + * @inode: link inode + * @rcu: true if in RCU-walk mode + * + * Check permission to follow a symbolic link when looking up a pathname. If + * @rcu is true, @inode is not stable. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { @@ -1347,6 +2051,20 @@ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, return call_int_hook(inode_follow_link, 0, dentry, inode, rcu); } +/** + * security_inode_permission() - Check if accessing an inode is allowed + * @inode: inode + * @mask: access mask + * + * Check permission before accessing an inode. This hook is called by the + * existing Linux permission function, so a security module can use it to + * provide additional checking for existing Linux permission checks. Notice + * that this hook is called when a file is opened (as well as many other + * operations), whereas the file_security_ops permission hook is called when + * the actual read/write operations are performed. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) @@ -1354,6 +2072,19 @@ int security_inode_permission(struct inode *inode, int mask) return call_int_hook(inode_permission, 0, inode, mask); } +/** + * security_inode_setattr() - Check if setting file attributes is allowed + * @idmap: idmap of the mount + * @dentry: file + * @attr: new attributes + * + * Check permission before setting file attributes. Note that the kernel call + * to notify_change is performed from several locations, whenever file + * attributes change (such as when a file is truncated, chown/chmod operations, + * transferring disk quotas, etc). + * + * Return: Returns 0 if permission is granted. + */ int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { @@ -1368,6 +2099,14 @@ int security_inode_setattr(struct mnt_idmap *idmap, } EXPORT_SYMBOL_GPL(security_inode_setattr); +/** + * security_inode_getattr() - Check if getting file attributes is allowed + * @path: file + * + * Check permission before obtaining file attributes. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_getattr(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) @@ -1375,6 +2114,19 @@ int security_inode_getattr(const struct path *path) return call_int_hook(inode_getattr, 0, path); } +/** + * security_inode_setxattr() - Check if setting file xattrs is allowed + * @idmap: idmap of the mount + * @dentry: file + * @name: xattr name + * @value: xattr value + * @size: size of xattr value + * @flags: flags + * + * Check permission before setting the extended attributes. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) @@ -1400,6 +2152,18 @@ int security_inode_setxattr(struct mnt_idmap *idmap, return evm_inode_setxattr(idmap, dentry, name, value, size); } +/** + * security_inode_set_acl() - Check if setting posix acls is allowed + * @idmap: idmap of the mount + * @dentry: file + * @acl_name: acl name + * @kacl: acl struct + * + * Check permission before setting posix acls, the posix acls in @kacl are + * identified by @acl_name. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) @@ -1418,6 +2182,17 @@ int security_inode_set_acl(struct mnt_idmap *idmap, return evm_inode_set_acl(idmap, dentry, acl_name, kacl); } +/** + * security_inode_get_acl() - Check if reading posix acls is allowed + * @idmap: idmap of the mount + * @dentry: file + * @acl_name: acl name + * + * Check permission before getting osix acls, the posix acls are identified by + * @acl_name. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { @@ -1426,6 +2201,17 @@ int security_inode_get_acl(struct mnt_idmap *idmap, return call_int_hook(inode_get_acl, 0, idmap, dentry, acl_name); } +/** + * security_inode_remove_acl() - Check if removing a posix acl is allowed + * @idmap: idmap of the mount + * @dentry: file + * @acl_name: acl name + * + * Check permission before removing posix acls, the posix acls are identified + * by @acl_name. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { @@ -1442,6 +2228,16 @@ int security_inode_remove_acl(struct mnt_idmap *idmap, return evm_inode_remove_acl(idmap, dentry, acl_name); } +/** + * security_inode_post_setxattr() - Update the inode after a setxattr operation + * @dentry: file + * @name: xattr name + * @value: xattr value + * @size: xattr value size + * @flags: flags + * + * Update inode security field after successful setxattr operation. + */ void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -1451,6 +2247,16 @@ void security_inode_post_setxattr(struct dentry *dentry, const char *name, evm_inode_post_setxattr(dentry, name, value, size); } +/** + * security_inode_getxattr() - Check if xattr access is allowed + * @dentry: file + * @name: xattr name + * + * Check permission before obtaining the extended attributes identified by + * @name for @dentry. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_getxattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) @@ -1458,6 +2264,15 @@ int security_inode_getxattr(struct dentry *dentry, const char *name) return call_int_hook(inode_getxattr, 0, dentry, name); } +/** + * security_inode_listxattr() - Check if listing xattrs is allowed + * @dentry: file + * + * Check permission before obtaining the list of extended attribute names for + * @dentry. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_listxattr(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) @@ -1465,6 +2280,17 @@ int security_inode_listxattr(struct dentry *dentry) return call_int_hook(inode_listxattr, 0, dentry); } +/** + * security_inode_removexattr() - Check if removing an xattr is allowed + * @idmap: idmap of the mount + * @dentry: file + * @name: xattr name + * + * Check permission before removing the extended attribute identified by @name + * for @dentry. + * + * Return: Returns 0 if permission is granted. + */ int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { @@ -1487,17 +2313,55 @@ int security_inode_removexattr(struct mnt_idmap *idmap, return evm_inode_removexattr(idmap, dentry, name); } +/** + * security_inode_need_killpriv() - Check if security_inode_killpriv() required + * @dentry: associated dentry + * + * Called when an inode has been changed to determine if + * security_inode_killpriv() should be called. + * + * Return: Return <0 on error to abort the inode change operation, return 0 if + * security_inode_killpriv() does not need to be called, return >0 if + * security_inode_killpriv() does need to be called. + */ int security_inode_need_killpriv(struct dentry *dentry) { return call_int_hook(inode_need_killpriv, 0, dentry); } +/** + * security_inode_killpriv() - The setuid bit is removed, update LSM state + * @idmap: idmap of the mount + * @dentry: associated dentry + * + * The @dentry's setuid bit is being removed. Remove similar security labels. + * Called with the dentry->d_inode->i_mutex held. + * + * Return: Return 0 on success. If error is returned, then the operation + * causing setuid bit removal is failed. + */ int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { return call_int_hook(inode_killpriv, 0, idmap, dentry); } +/** + * security_inode_getsecurity() - Get the xattr security label of an inode + * @idmap: idmap of the mount + * @inode: inode + * @name: xattr name + * @buffer: security label buffer + * @alloc: allocation flag + * + * Retrieve a copy of the extended attribute representation of the security + * label associated with @name for @inode via @buffer. Note that @name is the + * remainder of the attribute name after the security prefix has been removed. + * @alloc is used to specify if the call should return a value via the buffer + * or just the value length. + * + * Return: Returns size of buffer on success. + */ int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) @@ -1511,14 +2375,31 @@ int security_inode_getsecurity(struct mnt_idmap *idmap, * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) { - rc = hp->hook.inode_getsecurity(idmap, inode, name, buffer, alloc); + rc = hp->hook.inode_getsecurity(idmap, inode, name, buffer, + alloc); if (rc != LSM_RET_DEFAULT(inode_getsecurity)) return rc; } return LSM_RET_DEFAULT(inode_getsecurity); } -int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) +/** + * security_inode_setsecurity() - Set the xattr security label of an inode + * @inode: inode + * @name: xattr name + * @value: security label + * @size: length of security label + * @flags: flags + * + * Set the security label associated with @name for @inode from the extended + * attribute value @value. @size indicates the size of the @value in bytes. + * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the + * remainder of the attribute name after the security. prefix has been removed. + * + * Return: Returns 0 on success. + */ +int security_inode_setsecurity(struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct security_hook_list *hp; int rc; @@ -1530,14 +2411,28 @@ int security_inode_setsecurity(struct inode *inode, const char *name, const void */ hlist_for_each_entry(hp, &security_hook_heads.inode_setsecurity, list) { rc = hp->hook.inode_setsecurity(inode, name, value, size, - flags); + flags); if (rc != LSM_RET_DEFAULT(inode_setsecurity)) return rc; } return LSM_RET_DEFAULT(inode_setsecurity); } -int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) +/** + * security_inode_listsecurity() - List the xattr security label names + * @inode: inode + * @buffer: buffer + * @buffer_size: size of buffer + * + * Copy the extended attribute names for the security labels associated with + * @inode into @buffer. The maximum size of @buffer is specified by + * @buffer_size. @buffer may be NULL to request the size of the buffer + * required. + * + * Return: Returns number of bytes used/required on success. + */ +int security_inode_listsecurity(struct inode *inode, + char *buffer, size_t buffer_size) { if (unlikely(IS_PRIVATE(inode))) return 0; @@ -1545,17 +2440,49 @@ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer } EXPORT_SYMBOL(security_inode_listsecurity); +/** + * security_inode_getsecid() - Get an inode's secid + * @inode: inode + * @secid: secid to return + * + * Get the secid associated with the node. In case of failure, @secid will be + * set to zero. + */ void security_inode_getsecid(struct inode *inode, u32 *secid) { call_void_hook(inode_getsecid, inode, secid); } +/** + * security_inode_copy_up() - Create new creds for an overlayfs copy-up op + * @src: union dentry of copy-up file + * @new: newly created creds + * + * A file is about to be copied up from lower layer to upper layer of overlay + * filesystem. Security module can prepare a set of new creds and modify as + * need be and return new creds. Caller will switch to new creds temporarily to + * create new file and release newly allocated creds. + * + * Return: Returns 0 on success or a negative error code on error. + */ int security_inode_copy_up(struct dentry *src, struct cred **new) { return call_int_hook(inode_copy_up, 0, src, new); } EXPORT_SYMBOL(security_inode_copy_up); +/** + * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op + * @name: xattr name + * + * Filter the xattrs being copied up when a unioned file is copied up from a + * lower layer to the union/overlay layer. The caller is responsible for + * reading and writing the xattrs, this hook is merely a filter. + * + * Return: Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP + * if the security module does not know about attribute, or a negative + * error code to abort the copy up. + */ int security_inode_copy_up_xattr(const char *name) { struct security_hook_list *hp; @@ -1567,7 +2494,7 @@ int security_inode_copy_up_xattr(const char *name) * any other error code incase of an error. */ hlist_for_each_entry(hp, - &security_hook_heads.inode_copy_up_xattr, list) { + &security_hook_heads.inode_copy_up_xattr, list) { rc = hp->hook.inode_copy_up_xattr(name); if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) return rc; @@ -1577,12 +2504,41 @@ int security_inode_copy_up_xattr(const char *name) } EXPORT_SYMBOL(security_inode_copy_up_xattr); +/** + * security_kernfs_init_security() - Init LSM context for a kernfs node + * @kn_dir: parent kernfs node + * @kn: the kernfs node to initialize + * + * Initialize the security context of a newly created kernfs node based on its + * own and its parent's attributes. + * + * Return: Returns 0 if permission is granted. + */ int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { return call_int_hook(kernfs_init_security, 0, kn_dir, kn); } +/** + * security_file_permission() - Check file permissions + * @file: file + * @mask: requested permissions + * + * Check file permissions before accessing an open file. This hook is called + * by various operations that read or write files. A security module can use + * this hook to perform additional checking on these operations, e.g. to + * revalidate permissions on use to support privilege bracketing or policy + * changes. Notice that this hook is used when the actual read/write + * operations are performed, whereas the inode_security_ops hook is called when + * a file is opened (as well as many other operations). Although this hook can + * be used to revalidate permissions for various system call operations that + * read or write files, it does not address the revalidation of permissions for + * memory-mapped files. Security modules must handle this separately if they + * need such revalidation. + * + * Return: Returns 0 if permission is granted. + */ int security_file_permission(struct file *file, int mask) { int ret; @@ -1594,6 +2550,15 @@ int security_file_permission(struct file *file, int mask) return fsnotify_perm(file, mask); } +/** + * security_file_alloc() - Allocate and init a file's LSM blob + * @file: the file + * + * Allocate and attach a security structure to the file->f_security field. The + * security field is initialized to NULL when the structure is first created. + * + * Return: Return 0 if the hook is successful and permission is granted. + */ int security_file_alloc(struct file *file) { int rc = lsm_file_alloc(file); @@ -1606,6 +2571,12 @@ int security_file_alloc(struct file *file) return rc; } +/** + * security_file_free() - Free a file's LSM blob + * @file: the file + * + * Deallocate and free any security structures stored in file->f_security. + */ void security_file_free(struct file *file) { void *blob; @@ -1619,6 +2590,19 @@ void security_file_free(struct file *file) } } +/** + * security_file_ioctl() - Check if an ioctl is allowed + * @file: associated file + * @cmd: ioctl cmd + * @arg: ioctl arguments + * + * Check permission for an ioctl operation on @file. Note that @arg sometimes + * represents a user space pointer; in other cases, it may be a simple integer + * value. When @arg represents a user space pointer, it should never be used + * by the security module. + * + * Return: Returns 0 if permission is granted. + */ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, 0, file, cmd, arg); @@ -1658,8 +2642,19 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot) return prot; } +/** + * security_mmap_file() - Check if mmap'ing a file is allowed + * @file: file + * @prot: protection applied by the kernel + * @flags: flags + * + * Check permissions for a mmap operation. The @file may be NULL, e.g. if + * mapping anonymous memory. + * + * Return: Returns 0 if permission is granted. + */ int security_mmap_file(struct file *file, unsigned long prot, - unsigned long flags) + unsigned long flags) { unsigned long prot_adj = mmap_prot(file, prot); int ret; @@ -1670,13 +2665,31 @@ int security_mmap_file(struct file *file, unsigned long prot, return ima_file_mmap(file, prot, prot_adj, flags); } +/** + * security_mmap_addr() - Check if mmap'ing an address is allowed + * @addr: address + * + * Check permissions for a mmap operation at @addr. + * + * Return: Returns 0 if permission is granted. + */ int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, 0, addr); } +/** + * security_file_mprotect() - Check if changing memory protections is allowed + * @vma: memory region + * @reqprot: application requested protection + * @prot: protection applied by the kernel + * + * Check permissions before changing memory access permissions. + * + * Return: Returns 0 if permission is granted. + */ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, - unsigned long prot) + unsigned long prot) { int ret; @@ -1686,32 +2699,97 @@ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, return ima_file_mprotect(vma, prot); } +/** + * security_file_lock() - Check if a file lock is allowed + * @file: file + * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK) + * + * Check permission before performing file locking operations. Note the hook + * mediates both flock and fcntl style locks. + * + * Return: Returns 0 if permission is granted. + */ int security_file_lock(struct file *file, unsigned int cmd) { return call_int_hook(file_lock, 0, file, cmd); } +/** + * security_file_fcntl() - Check if fcntl() op is allowed + * @file: file + * @cmd: fnctl command + * @arg: command argument + * + * Check permission before allowing the file operation specified by @cmd from + * being performed on the file @file. Note that @arg sometimes represents a + * user space pointer; in other cases, it may be a simple integer value. When + * @arg represents a user space pointer, it should never be used by the + * security module. + * + * Return: Returns 0 if permission is granted. + */ int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_fcntl, 0, file, cmd, arg); } +/** + * security_file_set_fowner() - Set the file owner info in the LSM blob + * @file: the file + * + * Save owner security information (typically from current->security) in + * file->f_security for later use by the send_sigiotask hook. + * + * Return: Returns 0 on success. + */ void security_file_set_fowner(struct file *file) { call_void_hook(file_set_fowner, file); } +/** + * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed + * @tsk: target task + * @fown: signal sender + * @sig: signal to be sent, SIGIO is sent if 0 + * + * Check permission for the file owner @fown to send SIGIO or SIGURG to the + * process @tsk. Note that this hook is sometimes called from interrupt. Note + * that the fown_struct, @fown, is never outside the context of a struct file, + * so the file structure (and associated security information) can always be + * obtained: container_of(fown, struct file, f_owner). + * + * Return: Returns 0 if permission is granted. + */ int security_file_send_sigiotask(struct task_struct *tsk, - struct fown_struct *fown, int sig) + struct fown_struct *fown, int sig) { return call_int_hook(file_send_sigiotask, 0, tsk, fown, sig); } +/** + * security_file_receive() - Check is receiving a file via IPC is allowed + * @file: file being received + * + * This hook allows security modules to control the ability of a process to + * receive an open file descriptor via socket IPC. + * + * Return: Returns 0 if permission is granted. + */ int security_file_receive(struct file *file) { return call_int_hook(file_receive, 0, file); } +/** + * security_file_open() - Save open() time state for late use by the LSM + * @file: + * + * Save open-time permission checking state for later use upon file_permission, + * and recheck access if anything has changed since inode_permission. + * + * Return: Returns 0 if permission is granted. + */ int security_file_open(struct file *file) { int ret; @@ -1723,11 +2801,30 @@ int security_file_open(struct file *file) return fsnotify_perm(file, MAY_OPEN); } +/** + * security_file_truncate() - Check if truncating a file is allowed + * @file: file + * + * Check permission before truncating a file, i.e. using ftruncate. Note that + * truncation permission may also be checked based on the path, using the + * @path_truncate hook. + * + * Return: Returns 0 if permission is granted. + */ int security_file_truncate(struct file *file) { return call_int_hook(file_truncate, 0, file); } +/** + * security_task_alloc() - Allocate a task's LSM blob + * @task: the task + * @clone_flags: flags indicating what is being shared + * + * Handle allocation of task-related resources. + * + * Return: Returns a zero on success, negative values on failure. + */ int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { int rc = lsm_task_alloc(task); @@ -1740,6 +2837,13 @@ int security_task_alloc(struct task_struct *task, unsigned long clone_flags) return rc; } +/** + * security_task_free() - Free a task's LSM blob and related resources + * @task: task + * + * Handle release of task-related resources. Note that this can be called from + * interrupt context. + */ void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); @@ -1748,6 +2852,16 @@ void security_task_free(struct task_struct *task) task->security = NULL; } +/** + * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer + * @cred: credentials + * @gfp: gfp flags + * + * Only allocate sufficient memory and attach to @cred such that + * cred_transfer() will not get ENOMEM. + * + * Return: Returns 0 on success, negative values on failure. + */ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { int rc = lsm_cred_alloc(cred, gfp); @@ -1761,6 +2875,12 @@ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) return rc; } +/** + * security_cred_free() - Free the cred's LSM blob and associated resources + * @cred: credentials + * + * Deallocate and clear the cred->security field in a set of credentials. + */ void security_cred_free(struct cred *cred) { /* @@ -1776,6 +2896,16 @@ void security_cred_free(struct cred *cred) cred->security = NULL; } +/** + * security_prepare_creds() - Prepare a new set of credentials + * @new: new credentials + * @old: original credentials + * @gfp: gfp flags + * + * Prepare a new set of credentials by copying the data from the old set. + * + * Return: Returns 0 on success, negative values on failure. + */ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { int rc = lsm_cred_alloc(new, gfp); @@ -1789,11 +2919,26 @@ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) return rc; } +/** + * security_transfer_creds() - Transfer creds + * @new: target credentials + * @old: original credentials + * + * Transfer data from original creds to new creds. + */ void security_transfer_creds(struct cred *new, const struct cred *old) { call_void_hook(cred_transfer, new, old); } +/** + * security_cred_getsecid() - Get the secid from a set of credentials + * @c: credentials + * @secid: secid value + * + * Retrieve the security identifier of the cred structure @c. In case of + * failure, @secid will be set to zero. + */ void security_cred_getsecid(const struct cred *c, u32 *secid) { *secid = 0; @@ -1801,16 +2946,46 @@ void security_cred_getsecid(const struct cred *c, u32 *secid) } EXPORT_SYMBOL(security_cred_getsecid); +/** + * security_kernel_act_as() - Set the kernel credentials to act as secid + * @new: credentials + * @secid: secid + * + * Set the credentials for a kernel service to act as (subjective context). + * The current task must be the one that nominated @secid. + * + * Return: Returns 0 if successful. + */ int security_kernel_act_as(struct cred *new, u32 secid) { return call_int_hook(kernel_act_as, 0, new, secid); } +/** + * security_kernel_create_files_as() - Set file creation context using an inode + * @new: target credentials + * @inode: reference inode + * + * Set the file creation context in a set of credentials to be the same as the + * objective context of the specified inode. The current task must be the one + * that nominated @inode. + * + * Return: Returns 0 if successful. + */ int security_kernel_create_files_as(struct cred *new, struct inode *inode) { return call_int_hook(kernel_create_files_as, 0, new, inode); } +/** + * security_kernel_module_request() - Check is loading a module is allowed + * @kmod_name: module name + * + * Ability to trigger the kernel to automatically upcall to userspace for + * userspace to load a kernel module with the given name. + * + * Return: Returns 0 if successful. + */ int security_kernel_module_request(char *kmod_name) { int ret; @@ -1821,6 +2996,16 @@ int security_kernel_module_request(char *kmod_name) return integrity_kernel_module_request(kmod_name); } +/** + * security_kernel_read_file() - Read a file specified by userspace + * @file: file + * @id: file identifier + * @contents: trust if security_kernel_post_read_file() will be called + * + * Read a file specified by userspace. + * + * Return: Returns 0 if permission is granted. + */ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { @@ -1833,6 +3018,19 @@ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, } EXPORT_SYMBOL_GPL(security_kernel_read_file); +/** + * security_kernel_post_read_file() - Read a file specified by userspace + * @file: file + * @buf: file contents + * @size: size of file contents + * @id: file identifier + * + * Read a file specified by userspace. This must be paired with a prior call + * to security_kernel_read_file() call that indicated this hook would also be + * called, see security_kernel_read_file() for more information. + * + * Return: Returns 0 if permission is granted. + */ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { @@ -1845,6 +3043,15 @@ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, } EXPORT_SYMBOL_GPL(security_kernel_post_read_file); +/** + * security_kernel_load_data() - Load data provided by userspace + * @id: data identifier + * @contents: true if security_kernel_post_load_data() will be called + * + * Load data provided by userspace. + * + * Return: Returns 0 if permission is granted. + */ int security_kernel_load_data(enum kernel_load_data_id id, bool contents) { int ret; @@ -1856,6 +3063,20 @@ int security_kernel_load_data(enum kernel_load_data_id id, bool contents) } EXPORT_SYMBOL_GPL(security_kernel_load_data); +/** + * security_kernel_post_load_data() - Load userspace data from a non-file source + * @buf: data + * @size: size of data + * @id: data identifier + * @description: text description of data, specific to the id value + * + * Load data provided by a non-file source (usually userspace buffer). This + * must be paired with a prior security_kernel_load_data() call that indicated + * this hook would also be called, see security_kernel_load_data() for more + * information. + * + * Return: Returns 0 if permission is granted. + */ int security_kernel_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description) @@ -1870,38 +3091,112 @@ int security_kernel_post_load_data(char *buf, loff_t size, } EXPORT_SYMBOL_GPL(security_kernel_post_load_data); +/** + * security_task_fix_setuid() - Update LSM with new user id attributes + * @new: updated credentials + * @old: credentials being replaced + * @flags: LSM_SETID_* flag values + * + * Update the module's state after setting one or more of the user identity + * attributes of the current process. The @flags parameter indicates which of + * the set*uid system calls invoked this hook. If @new is the set of + * credentials that will be installed. Modifications should be made to this + * rather than to @current->cred. + * + * Return: Returns 0 on success. + */ int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setuid, 0, new, old, flags); } +/** + * security_task_fix_setgid() - Update LSM with new group id attributes + * @new: updated credentials + * @old: credentials being replaced + * @flags: LSM_SETID_* flag value + * + * Update the module's state after setting one or more of the group identity + * attributes of the current process. The @flags parameter indicates which of + * the set*gid system calls invoked this hook. @new is the set of credentials + * that will be installed. Modifications should be made to this rather than to + * @current->cred. + * + * Return: Returns 0 on success. + */ int security_task_fix_setgid(struct cred *new, const struct cred *old, - int flags) + int flags) { return call_int_hook(task_fix_setgid, 0, new, old, flags); } +/** + * security_task_fix_setgroups() - Update LSM with new supplementary groups + * @new: updated credentials + * @old: credentials being replaced + * + * Update the module's state after setting the supplementary group identity + * attributes of the current process. @new is the set of credentials that will + * be installed. Modifications should be made to this rather than to + * @current->cred. + * + * Return: Returns 0 on success. + */ int security_task_fix_setgroups(struct cred *new, const struct cred *old) { return call_int_hook(task_fix_setgroups, 0, new, old); } +/** + * security_task_setpgid() - Check if setting the pgid is allowed + * @p: task being modified + * @pgid: new pgid + * + * Check permission before setting the process group identifier of the process + * @p to @pgid. + * + * Return: Returns 0 if permission is granted. + */ int security_task_setpgid(struct task_struct *p, pid_t pgid) { return call_int_hook(task_setpgid, 0, p, pgid); } +/** + * security_task_getpgid() - Check if getting the pgid is allowed + * @p: task + * + * Check permission before getting the process group identifier of the process + * @p. + * + * Return: Returns 0 if permission is granted. + */ int security_task_getpgid(struct task_struct *p) { return call_int_hook(task_getpgid, 0, p); } +/** + * security_task_getsid() - Check if getting the session id is allowed + * @p: task + * + * Check permission before getting the session identifier of the process @p. + * + * Return: Returns 0 if permission is granted. + */ int security_task_getsid(struct task_struct *p) { return call_int_hook(task_getsid, 0, p); } +/** + * security_current_getsecid_subj() - Get the current task's subjective secid + * @secid: secid value + * + * Retrieve the subjective security identifier of the current task and return + * it in @secid. In case of failure, @secid will be set to zero. + */ void security_current_getsecid_subj(u32 *secid) { *secid = 0; @@ -1909,6 +3204,14 @@ void security_current_getsecid_subj(u32 *secid) } EXPORT_SYMBOL(security_current_getsecid_subj); +/** + * security_task_getsecid_obj() - Get a task's objective secid + * @p: target task + * @secid: secid value + * + * Retrieve the objective security identifier of the task_struct in @p and + * return it in @secid. In case of failure, @secid will be set to zero. + */ void security_task_getsecid_obj(struct task_struct *p, u32 *secid) { *secid = 0; @@ -1916,56 +3219,159 @@ void security_task_getsecid_obj(struct task_struct *p, u32 *secid) } EXPORT_SYMBOL(security_task_getsecid_obj); +/** + * security_task_setnice() - Check if setting a task's nice value is allowed + * @p: target task + * @nice: nice value + * + * Check permission before setting the nice value of @p to @nice. + * + * Return: Returns 0 if permission is granted. + */ int security_task_setnice(struct task_struct *p, int nice) { return call_int_hook(task_setnice, 0, p, nice); } +/** + * security_task_setioprio() - Check if setting a task's ioprio is allowed + * @p: target task + * @ioprio: ioprio value + * + * Check permission before setting the ioprio value of @p to @ioprio. + * + * Return: Returns 0 if permission is granted. + */ int security_task_setioprio(struct task_struct *p, int ioprio) { return call_int_hook(task_setioprio, 0, p, ioprio); } +/** + * security_task_getioprio() - Check if getting a task's ioprio is allowed + * @p: task + * + * Check permission before getting the ioprio value of @p. + * + * Return: Returns 0 if permission is granted. + */ int security_task_getioprio(struct task_struct *p) { return call_int_hook(task_getioprio, 0, p); } +/** + * security_task_prlimit() - Check if get/setting resources limits is allowed + * @cred: current task credentials + * @tcred: target task credentials + * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both + * + * Check permission before getting and/or setting the resource limits of + * another task. + * + * Return: Returns 0 if permission is granted. + */ int security_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { return call_int_hook(task_prlimit, 0, cred, tcred, flags); } +/** + * security_task_setrlimit() - Check if setting a new rlimit value is allowed + * @p: target task's group leader + * @resource: resource whose limit is being set + * @new_rlim: new resource limit + * + * Check permission before setting the resource limits of process @p for + * @resource to @new_rlim. The old resource limit values can be examined by + * dereferencing (p->signal->rlim + resource). + * + * Return: Returns 0 if permission is granted. + */ int security_task_setrlimit(struct task_struct *p, unsigned int resource, - struct rlimit *new_rlim) + struct rlimit *new_rlim) { return call_int_hook(task_setrlimit, 0, p, resource, new_rlim); } +/** + * security_task_setscheduler() - Check if setting sched policy/param is allowed + * @p: target task + * + * Check permission before setting scheduling policy and/or parameters of + * process @p. + * + * Return: Returns 0 if permission is granted. + */ int security_task_setscheduler(struct task_struct *p) { return call_int_hook(task_setscheduler, 0, p); } +/** + * security_task_getscheduler() - Check if getting scheduling info is allowed + * @p: target task + * + * Check permission before obtaining scheduling information for process @p. + * + * Return: Returns 0 if permission is granted. + */ int security_task_getscheduler(struct task_struct *p) { return call_int_hook(task_getscheduler, 0, p); } +/** + * security_task_movememory() - Check if moving memory is allowed + * @p: task + * + * Check permission before moving memory owned by process @p. + * + * Return: Returns 0 if permission is granted. + */ int security_task_movememory(struct task_struct *p) { return call_int_hook(task_movememory, 0, p); } +/** + * security_task_kill() - Check if sending a signal is allowed + * @p: target process + * @info: signal information + * @sig: signal value + * @cred: credentials of the signal sender, NULL if @current + * + * Check permission before sending signal @sig to @p. @info can be NULL, the + * constant 1, or a pointer to a kernel_siginfo structure. If @info is 1 or + * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from + * the kernel and should typically be permitted. SIGIO signals are handled + * separately by the send_sigiotask hook in file_security_ops. + * + * Return: Returns 0 if permission is granted. + */ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, - int sig, const struct cred *cred) + int sig, const struct cred *cred) { return call_int_hook(task_kill, 0, p, info, sig, cred); } +/** + * security_task_prctl() - Check if a prctl op is allowed + * @option: operation + * @arg2: argument + * @arg3: argument + * @arg4: argument + * @arg5: argument + * + * Check permission before performing a process control operation on the + * current process. + * + * Return: Return -ENOSYS if no-one wanted to handle this op, any other value + * to cause prctl() to return immediately with that value. + */ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5) + unsigned long arg4, unsigned long arg5) { int thisrc; int rc = LSM_RET_DEFAULT(task_prctl); @@ -1982,27 +3388,69 @@ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, return rc; } +/** + * security_task_to_inode() - Set the security attributes of a task's inode + * @p: task + * @inode: inode + * + * Set the security attributes for an inode based on an associated task's + * security attributes, e.g. for /proc/pid inodes. + */ void security_task_to_inode(struct task_struct *p, struct inode *inode) { call_void_hook(task_to_inode, p, inode); } +/** + * security_create_user_ns() - Check if creating a new userns is allowed + * @cred: prepared creds + * + * Check permission prior to creating a new user namespace. + * + * Return: Returns 0 if successful, otherwise < 0 error code. + */ int security_create_user_ns(const struct cred *cred) { return call_int_hook(userns_create, 0, cred); } +/** + * security_ipc_permission() - Check if sysv ipc access is allowed + * @ipcp: ipc permission structure + * @flag: requested permissions + * + * Check permissions for access to IPC. + * + * Return: Returns 0 if permission is granted. + */ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, 0, ipcp, flag); } +/** + * security_ipc_getsecid() - Get the sysv ipc object's secid + * @ipcp: ipc permission structure + * @secid: secid pointer + * + * Get the secid associated with the ipc object. In case of failure, @secid + * will be set to zero. + */ void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) { *secid = 0; call_void_hook(ipc_getsecid, ipcp, secid); } +/** + * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob + * @msg: message structure + * + * Allocate and attach a security structure to the msg->security field. The + * security field is initialized to NULL when the structure is first created. + * + * Return: Return 0 if operation was successful and permission is granted. + */ int security_msg_msg_alloc(struct msg_msg *msg) { int rc = lsm_msg_msg_alloc(msg); @@ -2015,6 +3463,12 @@ int security_msg_msg_alloc(struct msg_msg *msg) return rc; } +/** + * security_msg_msg_free() - Free a sysv ipc message LSM blob + * @msg: message structure + * + * Deallocate the security structure for this message. + */ void security_msg_msg_free(struct msg_msg *msg) { call_void_hook(msg_msg_free_security, msg); @@ -2022,6 +3476,15 @@ void security_msg_msg_free(struct msg_msg *msg) msg->security = NULL; } +/** + * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob + * @msq: sysv ipc permission structure + * + * Allocate and attach a security structure to @msg. The security field is + * initialized to NULL when the structure is first created. + * + * Return: Returns 0 if operation was successful and permission is granted. + */ int security_msg_queue_alloc(struct kern_ipc_perm *msq) { int rc = lsm_ipc_alloc(msq); @@ -2034,6 +3497,12 @@ int security_msg_queue_alloc(struct kern_ipc_perm *msq) return rc; } +/** + * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob + * @msq: sysv ipc permission structure + * + * Deallocate security field @perm->security for the message queue. + */ void security_msg_queue_free(struct kern_ipc_perm *msq) { call_void_hook(msg_queue_free_security, msq); @@ -2041,28 +3510,84 @@ void security_msg_queue_free(struct kern_ipc_perm *msq) msq->security = NULL; } +/** + * security_msg_queue_associate() - Check if a msg queue operation is allowed + * @msq: sysv ipc permission structure + * @msqflg: operation flags + * + * Check permission when a message queue is requested through the msgget system + * call. This hook is only called when returning the message queue identifier + * for an existing message queue, not when a new message queue is created. + * + * Return: Return 0 if permission is granted. + */ int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { return call_int_hook(msg_queue_associate, 0, msq, msqflg); } +/** + * security_msg_queue_msgctl() - Check if a msg queue operation is allowed + * @msq: sysv ipc permission structure + * @cmd: operation + * + * Check permission when a message control operation specified by @cmd is to be + * performed on the message queue with permissions. + * + * Return: Returns 0 if permission is granted. + */ int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { return call_int_hook(msg_queue_msgctl, 0, msq, cmd); } +/** + * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed + * @msq: sysv ipc permission structure + * @msg: message + * @msqflg: operation flags + * + * Check permission before a message, @msg, is enqueued on the message queue + * with permissions specified in @msq. + * + * Return: Returns 0 if permission is granted. + */ int security_msg_queue_msgsnd(struct kern_ipc_perm *msq, - struct msg_msg *msg, int msqflg) + struct msg_msg *msg, int msqflg) { return call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg); } +/** + * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed + * @msq: sysv ipc permission structure + * @msg: message + * @target: target task + * @type: type of message requested + * @mode: operation flags + * + * Check permission before a message, @msg, is removed from the message queue. + * The @target task structure contains a pointer to the process that will be + * receiving the message (not equal to the current process when inline receives + * are being performed). + * + * Return: Returns 0 if permission is granted. + */ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, - struct task_struct *target, long type, int mode) + struct task_struct *target, long type, int mode) { return call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type, mode); } +/** + * security_shm_alloc() - Allocate a sysv shm LSM blob + * @shp: sysv ipc permission structure + * + * Allocate and attach a security structure to the @shp security field. The + * security field is initialized to NULL when the structure is first created. + * + * Return: Returns 0 if operation was successful and permission is granted. + */ int security_shm_alloc(struct kern_ipc_perm *shp) { int rc = lsm_ipc_alloc(shp); @@ -2075,6 +3600,12 @@ int security_shm_alloc(struct kern_ipc_perm *shp) return rc; } +/** + * security_shm_free() - Free a sysv shm LSM blob + * @shp: sysv ipc permission structure + * + * Deallocate the security structure @perm->security for the memory segment. + */ void security_shm_free(struct kern_ipc_perm *shp) { call_void_hook(shm_free_security, shp); @@ -2082,21 +3613,65 @@ void security_shm_free(struct kern_ipc_perm *shp) shp->security = NULL; } +/** + * security_shm_associate() - Check if a sysv shm operation is allowed + * @shp: sysv ipc permission structure + * @shmflg: operation flags + * + * Check permission when a shared memory region is requested through the shmget + * system call. This hook is only called when returning the shared memory + * region identifier for an existing region, not when a new shared memory + * region is created. + * + * Return: Returns 0 if permission is granted. + */ int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) { return call_int_hook(shm_associate, 0, shp, shmflg); } +/** + * security_shm_shmctl() - Check if a sysv shm operation is allowed + * @shp: sysv ipc permission structure + * @cmd: operation + * + * Check permission when a shared memory control operation specified by @cmd is + * to be performed on the shared memory region with permissions in @shp. + * + * Return: Return 0 if permission is granted. + */ int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { return call_int_hook(shm_shmctl, 0, shp, cmd); } -int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) +/** + * security_shm_shmat() - Check if a sysv shm attach operation is allowed + * @shp: sysv ipc permission structure + * @shmaddr: address of memory region to attach + * @shmflg: operation flags + * + * Check permissions prior to allowing the shmat system call to attach the + * shared memory segment with permissions @shp to the data segment of the + * calling process. The attaching address is specified by @shmaddr. + * + * Return: Returns 0 if permission is granted. + */ +int security_shm_shmat(struct kern_ipc_perm *shp, + char __user *shmaddr, int shmflg) { return call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg); } +/** + * security_sem_alloc() - Allocate a sysv semaphore LSM blob + * @sma: sysv ipc permission structure + * + * Allocate and attach a security structure to the @sma security field. The + * security field is initialized to NULL when the structure is first created. + * + * Return: Returns 0 if operation was successful and permission is granted. + */ int security_sem_alloc(struct kern_ipc_perm *sma) { int rc = lsm_ipc_alloc(sma); @@ -2109,6 +3684,12 @@ int security_sem_alloc(struct kern_ipc_perm *sma) return rc; } +/** + * security_sem_free() - Free a sysv semaphore LSM blob + * @sma: sysv ipc permission structure + * + * Deallocate security structure @sma->security for the semaphore. + */ void security_sem_free(struct kern_ipc_perm *sma) { call_void_hook(sem_free_security, sma); @@ -2116,22 +3697,62 @@ void security_sem_free(struct kern_ipc_perm *sma) sma->security = NULL; } +/** + * security_sem_associate() - Check if a sysv semaphore operation is allowed + * @sma: sysv ipc permission structure + * @semflg: operation flags + * + * Check permission when a semaphore is requested through the semget system + * call. This hook is only called when returning the semaphore identifier for + * an existing semaphore, not when a new one must be created. + * + * Return: Returns 0 if permission is granted. + */ int security_sem_associate(struct kern_ipc_perm *sma, int semflg) { return call_int_hook(sem_associate, 0, sma, semflg); } +/** + * security_sem_semctl() - Check if a sysv semaphore operation is allowed + * @sma: sysv ipc permission structure + * @cmd: operation + * + * Check permission when a semaphore operation specified by @cmd is to be + * performed on the semaphore. + * + * Return: Returns 0 if permission is granted. + */ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd) { return call_int_hook(sem_semctl, 0, sma, cmd); } +/** + * security_sem_semop() - Check if a sysv semaphore operation is allowed + * @sma: sysv ipc permission structure + * @sops: operations to perform + * @nsops: number of operations + * @alter: flag indicating changes will be made + * + * Check permissions before performing operations on members of the semaphore + * set. If the @alter flag is nonzero, the semaphore set may be modified. + * + * Return: Returns 0 if permission is granted. + */ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, - unsigned nsops, int alter) + unsigned nsops, int alter) { return call_int_hook(sem_semop, 0, sma, sops, nsops, alter); } +/** + * security_d_instantiate() - Populate an inode's LSM state based on a dentry + * @dentry: dentry + * @inode: inode + * + * Fill in @inode security information for a @dentry if allowed. + */ void security_d_instantiate(struct dentry *dentry, struct inode *inode) { if (unlikely(inode && IS_PRIVATE(inode))) @@ -2140,6 +3761,17 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode) } EXPORT_SYMBOL(security_d_instantiate); +/** + * security_getprocattr() - Read an attribute for a task + * @p: the task + * @lsm: LSM name + * @name: attribute name + * @value: attribute value + * + * Read attribute @name for task @p and store it into @value if allowed. + * + * Return: Returns the length of @value on success, a negative value otherwise. + */ int security_getprocattr(struct task_struct *p, const char *lsm, const char *name, char **value) { @@ -2153,6 +3785,18 @@ int security_getprocattr(struct task_struct *p, const char *lsm, return LSM_RET_DEFAULT(getprocattr); } +/** + * security_setprocattr() - Set an attribute for a task + * @lsm: LSM name + * @name: attribute name + * @value: attribute value + * @size: attribute value size + * + * Write (set) the current task's attribute @name to @value, size @size if + * allowed. + * + * Return: Returns bytes written on success, a negative value otherwise. + */ int security_setprocattr(const char *lsm, const char *name, void *value, size_t size) { @@ -2166,17 +3810,51 @@ int security_setprocattr(const char *lsm, const char *name, void *value, return LSM_RET_DEFAULT(setprocattr); } +/** + * security_netlink_send() - Save info and check if netlink sending is allowed + * @sk: sending socket + * @skb: netlink message + * + * Save security information for a netlink message so that permission checking + * can be performed when the message is processed. The security information + * can be saved using the eff_cap field of the netlink_skb_parms structure. + * Also may be used to provide fine grained control over message transmission. + * + * Return: Returns 0 if the information was successfully saved and message is + * allowed to be transmitted. + */ int security_netlink_send(struct sock *sk, struct sk_buff *skb) { return call_int_hook(netlink_send, 0, sk, skb); } +/** + * security_ismaclabel() - Check is the named attribute is a MAC label + * @name: full extended attribute name + * + * Check if the extended attribute specified by @name represents a MAC label. + * + * Return: Returns 1 if name is a MAC attribute otherwise returns 0. + */ int security_ismaclabel(const char *name) { return call_int_hook(ismaclabel, 0, name); } EXPORT_SYMBOL(security_ismaclabel); +/** + * security_secid_to_secctx() - Convert a secid to a secctx + * @secid: secid + * @secdata: secctx + * @seclen: secctx length + * + * Convert secid to security context. If @secdata is NULL the length of the + * result will be returned in @seclen, but no @secdata will be returned. This + * does mean that the length could change between calls to check the length and + * the next call which actually allocates and returns the @secdata. + * + * Return: Return 0 on success, error on failure. + */ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { struct security_hook_list *hp; @@ -2196,6 +3874,16 @@ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) } EXPORT_SYMBOL(security_secid_to_secctx); +/** + * security_secctx_to_secid() - Convert a secctx to a secid + * @secdata: secctx + * @seclen: length of secctx + * @secid: secid + * + * Convert security context to secid. + * + * Return: Returns 0 on success, error on failure. + */ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { *secid = 0; @@ -2203,30 +3891,86 @@ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) } EXPORT_SYMBOL(security_secctx_to_secid); +/** + * security_release_secctx() - Free a secctx buffer + * @secdata: secctx + * @seclen: length of secctx + * + * Release the security context. + */ void security_release_secctx(char *secdata, u32 seclen) { call_void_hook(release_secctx, secdata, seclen); } EXPORT_SYMBOL(security_release_secctx); +/** + * security_inode_invalidate_secctx() - Invalidate an inode's security label + * @inode: inode + * + * Notify the security module that it must revalidate the security context of + * an inode. + */ void security_inode_invalidate_secctx(struct inode *inode) { call_void_hook(inode_invalidate_secctx, inode); } EXPORT_SYMBOL(security_inode_invalidate_secctx); +/** + * security_inode_notifysecctx() - Nofify the LSM of an inode's security label + * @inode: inode + * @ctx: secctx + * @ctxlen: length of secctx + * + * Notify the security module of what the security context of an inode should + * be. Initializes the incore security context managed by the security module + * for this inode. Example usage: NFS client invokes this hook to initialize + * the security context in its incore inode to the value provided by the server + * for the file when the server returned the file's attributes to the client. + * Must be called with inode->i_mutex locked. + * + * Return: Returns 0 on success, error on failure. + */ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_notifysecctx); +/** + * security_inode_setsecctx() - Change the security label of an inode + * @dentry: inode + * @ctx: secctx + * @ctxlen: length of secctx + * + * Change the security context of an inode. Updates the incore security + * context managed by the security module and invokes the fs code as needed + * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the + * context. Example usage: NFS server invokes this hook to change the security + * context in its incore inode and on the backing filesystem to a value + * provided by the client on a SETATTR operation. Must be called with + * inode->i_mutex locked. + * + * Return: Returns 0 on success, error on failure. + */ int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_setsecctx); +/** + * security_inode_getsecctx() - Get the security label of an inode + * @inode: inode + * @ctx: secctx + * @ctxlen: length of secctx + * + * On success, returns 0 and fills out @ctx and @ctxlen with the security + * context for the given @inode. + * + * Return: Returns 0 on success, error on failure. + */ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { return call_int_hook(inode_getsecctx, -EOPNOTSUPP, inode, ctx, ctxlen); @@ -2234,6 +3978,16 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_WATCH_QUEUE +/** + * security_post_notification() - Check if a watch notification can be posted + * @w_cred: credentials of the task that set the watch + * @cred: credentials of the task which triggered the watch + * @n: the notification + * + * Check to see if a watch notification can be posted to a particular queue. + * + * Return: Returns 0 if permission is granted. + */ int security_post_notification(const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) @@ -2243,106 +3997,336 @@ int security_post_notification(const struct cred *w_cred, #endif /* CONFIG_WATCH_QUEUE */ #ifdef CONFIG_KEY_NOTIFICATIONS +/** + * security_watch_key() - Check if a task is allowed to watch for key events + * @key: the key to watch + * + * Check to see if a process is allowed to watch for event notifications from + * a key or keyring. + * + * Return: Returns 0 if permission is granted. + */ int security_watch_key(struct key *key) { return call_int_hook(watch_key, 0, key); } -#endif +#endif /* CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK - -int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) +/** + * security_unix_stream_connect() - Check if a AF_UNIX stream is allowed + * @sock: originating sock + * @other: peer sock + * @newsk: new sock + * + * Check permissions before establishing a Unix domain stream connection + * between @sock and @other. + * + * The @unix_stream_connect and @unix_may_send hooks were necessary because + * Linux provides an alternative to the conventional file name space for Unix + * domain sockets. Whereas binding and connecting to sockets in the file name + * space is mediated by the typical file permissions (and caught by the mknod + * and permission hooks in inode_security_ops), binding and connecting to + * sockets in the abstract name space is completely unmediated. Sufficient + * control of Unix domain sockets in the abstract name space isn't possible + * using only the socket layer hooks, since we need to know the actual target + * socket, which is not looked up until we are inside the af_unix code. + * + * Return: Returns 0 if permission is granted. + */ +int security_unix_stream_connect(struct sock *sock, struct sock *other, + struct sock *newsk) { return call_int_hook(unix_stream_connect, 0, sock, other, newsk); } EXPORT_SYMBOL(security_unix_stream_connect); +/** + * security_unix_may_send() - Check if AF_UNIX socket can send datagrams + * @sock: originating sock + * @other: peer sock + * + * Check permissions before connecting or sending datagrams from @sock to + * @other. + * + * The @unix_stream_connect and @unix_may_send hooks were necessary because + * Linux provides an alternative to the conventional file name space for Unix + * domain sockets. Whereas binding and connecting to sockets in the file name + * space is mediated by the typical file permissions (and caught by the mknod + * and permission hooks in inode_security_ops), binding and connecting to + * sockets in the abstract name space is completely unmediated. Sufficient + * control of Unix domain sockets in the abstract name space isn't possible + * using only the socket layer hooks, since we need to know the actual target + * socket, which is not looked up until we are inside the af_unix code. + * + * Return: Returns 0 if permission is granted. + */ int security_unix_may_send(struct socket *sock, struct socket *other) { return call_int_hook(unix_may_send, 0, sock, other); } EXPORT_SYMBOL(security_unix_may_send); +/** + * security_socket_create() - Check if creating a new socket is allowed + * @family: protocol family + * @type: communications type + * @protocol: requested protocol + * @kern: set to 1 if a kernel socket is requested + * + * Check permissions prior to creating a new socket. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_create(int family, int type, int protocol, int kern) { return call_int_hook(socket_create, 0, family, type, protocol, kern); } +/** + * security_socket_post_create() - Initialize a newly created socket + * @sock: socket + * @family: protocol family + * @type: communications type + * @protocol: requested protocol + * @kern: set to 1 if a kernel socket is requested + * + * This hook allows a module to update or allocate a per-socket security + * structure. Note that the security field was not added directly to the socket + * structure, but rather, the socket security information is stored in the + * associated inode. Typically, the inode alloc_security hook will allocate + * and attach security information to SOCK_INODE(sock)->i_security. This hook + * may be used to update the SOCK_INODE(sock)->i_security field with additional + * information that wasn't available when the inode was allocated. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { return call_int_hook(socket_post_create, 0, sock, family, type, - protocol, kern); + protocol, kern); } +/** + * security_socket_socketpair() - Check if creating a socketpair is allowed + * @socka: first socket + * @sockb: second socket + * + * Check permissions before creating a fresh pair of sockets. + * + * Return: Returns 0 if permission is granted and the connection was + * established. + */ int security_socket_socketpair(struct socket *socka, struct socket *sockb) { return call_int_hook(socket_socketpair, 0, socka, sockb); } EXPORT_SYMBOL(security_socket_socketpair); -int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) +/** + * security_socket_bind() - Check if a socket bind operation is allowed + * @sock: socket + * @address: requested bind address + * @addrlen: length of address + * + * Check permission before socket protocol layer bind operation is performed + * and the socket @sock is bound to the address specified in the @address + * parameter. + * + * Return: Returns 0 if permission is granted. + */ +int security_socket_bind(struct socket *sock, + struct sockaddr *address, int addrlen) { return call_int_hook(socket_bind, 0, sock, address, addrlen); } -int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) +/** + * security_socket_connect() - Check if a socket connect operation is allowed + * @sock: socket + * @address: address of remote connection point + * @addrlen: length of address + * + * Check permission before socket protocol layer connect operation attempts to + * connect socket @sock to a remote address, @address. + * + * Return: Returns 0 if permission is granted. + */ +int security_socket_connect(struct socket *sock, + struct sockaddr *address, int addrlen) { return call_int_hook(socket_connect, 0, sock, address, addrlen); } +/** + * security_socket_listen() - Check if a socket is allowed to listen + * @sock: socket + * @backlog: connection queue size + * + * Check permission before socket protocol layer listen operation. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_listen(struct socket *sock, int backlog) { return call_int_hook(socket_listen, 0, sock, backlog); } +/** + * security_socket_accept() - Check if a socket is allowed to accept connections + * @sock: listening socket + * @newsock: newly creation connection socket + * + * Check permission before accepting a new connection. Note that the new + * socket, @newsock, has been created and some information copied to it, but + * the accept operation has not actually been performed. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_accept(struct socket *sock, struct socket *newsock) { return call_int_hook(socket_accept, 0, sock, newsock); } +/** + * security_socket_sendmsg() - Check is sending a message is allowed + * @sock: sending socket + * @msg: message to send + * @size: size of message + * + * Check permission before transmitting a message to another socket. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return call_int_hook(socket_sendmsg, 0, sock, msg, size); } +/** + * security_socket_recvmsg() - Check if receiving a message is allowed + * @sock: receiving socket + * @msg: message to receive + * @size: size of message + * @flags: operational flags + * + * Check permission before receiving a message from a socket. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return call_int_hook(socket_recvmsg, 0, sock, msg, size, flags); } +/** + * security_socket_getsockname() - Check if reading the socket addr is allowed + * @sock: socket + * + * Check permission before reading the local address (name) of the socket + * object. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_getsockname(struct socket *sock) { return call_int_hook(socket_getsockname, 0, sock); } +/** + * security_socket_getpeername() - Check if reading the peer's addr is allowed + * @sock: socket + * + * Check permission before the remote address (name) of a socket object. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_getpeername(struct socket *sock) { return call_int_hook(socket_getpeername, 0, sock); } +/** + * security_socket_getsockopt() - Check if reading a socket option is allowed + * @sock: socket + * @level: option's protocol level + * @optname: option name + * + * Check permissions before retrieving the options associated with socket + * @sock. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_getsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_getsockopt, 0, sock, level, optname); } +/** + * security_socket_setsockopt() - Check if setting a socket option is allowed + * @sock: socket + * @level: option's protocol level + * @optname: option name + * + * Check permissions before setting the options associated with socket @sock. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_setsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_setsockopt, 0, sock, level, optname); } +/** + * security_socket_shutdown() - Checks if shutting down the socket is allowed + * @sock: socket + * @how: flag indicating how sends and receives are handled + * + * Checks permission before all or part of a connection on the socket @sock is + * shut down. + * + * Return: Returns 0 if permission is granted. + */ int security_socket_shutdown(struct socket *sock, int how) { return call_int_hook(socket_shutdown, 0, sock, how); } +/** + * security_sock_rcv_skb() - Check if an incoming network packet is allowed + * @sk: destination sock + * @skb: incoming packet + * + * Check permissions on incoming network packets. This hook is distinct from + * Netfilter's IP input hooks since it is the first time that the incoming + * sk_buff @skb has been associated with a particular socket, @sk. Must not + * sleep inside this hook because some callers hold spinlocks. + * + * Return: Returns 0 if permission is granted. + */ int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { return call_int_hook(socket_sock_rcv_skb, 0, sk, skb); } EXPORT_SYMBOL(security_sock_rcv_skb); +/** + * security_socket_getpeersec_stream() - Get the remote peer label + * @sock: socket + * @optval: destination buffer + * @optlen: size of peer label copied into the buffer + * @len: maximum size of the destination buffer + * + * This hook allows the security module to provide peer socket security state + * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC. + * For tcp sockets this can be meaningful if the socket is associated with an + * ipsec SA. + * + * Return: Returns 0 if all is well, otherwise, typical getsockopt return + * values. + */ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { @@ -2350,23 +4334,62 @@ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, optval, optlen, len); } -int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) +/** + * security_socket_getpeersec_dgram() - Get the remote peer label + * @sock: socket + * @skb: datagram packet + * @secid: remote peer label secid + * + * This hook allows the security module to provide peer socket security state + * for udp sockets on a per-packet basis to userspace via getsockopt + * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC + * option via getsockopt. It can then retrieve the security state returned by + * this hook for a packet via the SCM_SECURITY ancillary message type. + * + * Return: Returns 0 on success, error on failure. + */ +int security_socket_getpeersec_dgram(struct socket *sock, + struct sk_buff *skb, u32 *secid) { return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock, skb, secid); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); +/** + * security_sk_alloc() - Allocate and initialize a sock's LSM blob + * @sk: sock + * @family: protocol family + * @priority: gfp flags + * + * Allocate and attach a security structure to the sk->sk_security field, which + * is used to copy security attributes between local stream sockets. + * + * Return: Returns 0 on success, error on failure. + */ int security_sk_alloc(struct sock *sk, int family, gfp_t priority) { return call_int_hook(sk_alloc_security, 0, sk, family, priority); } +/** + * security_sk_free() - Free the sock's LSM blob + * @sk: sock + * + * Deallocate security structure. + */ void security_sk_free(struct sock *sk) { call_void_hook(sk_free_security, sk); } +/** + * security_sk_clone() - Clone a sock's LSM state + * @sk: original sock + * @newsk: target sock + * + * Clone/copy security structure. + */ void security_sk_clone(const struct sock *sk, struct sock *newsk) { call_void_hook(sk_clone_security, sk, newsk); @@ -2379,6 +4402,13 @@ void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic) } EXPORT_SYMBOL(security_sk_classify_flow); +/** + * security_req_classify_flow() - Set a flow's secid based on request_sock + * @req: request_sock + * @flic: target flow + * + * Sets @flic's secid to @req's secid. + */ void security_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { @@ -2386,92 +4416,215 @@ void security_req_classify_flow(const struct request_sock *req, } EXPORT_SYMBOL(security_req_classify_flow); +/** + * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket + * @sk: sock being grafted + * @parent: target parent socket + * + * Sets @parent's inode secid to @sk's secid and update @sk with any necessary + * LSM state from @parent. + */ void security_sock_graft(struct sock *sk, struct socket *parent) { call_void_hook(sock_graft, sk, parent); } EXPORT_SYMBOL(security_sock_graft); +/** + * security_inet_conn_request() - Set request_sock state using incoming connect + * @sk: parent listening sock + * @skb: incoming connection + * @req: new request_sock + * + * Initialize the @req LSM state based on @sk and the incoming connect in @skb. + * + * Return: Returns 0 if permission is granted. + */ int security_inet_conn_request(const struct sock *sk, - struct sk_buff *skb, struct request_sock *req) + struct sk_buff *skb, struct request_sock *req) { return call_int_hook(inet_conn_request, 0, sk, skb, req); } EXPORT_SYMBOL(security_inet_conn_request); +/** + * security_inet_csk_clone() - Set new sock LSM state based on request_sock + * @newsk: new sock + * @req: connection request_sock + * + * Set that LSM state of @sock using the LSM state from @req. + */ void security_inet_csk_clone(struct sock *newsk, - const struct request_sock *req) + const struct request_sock *req) { call_void_hook(inet_csk_clone, newsk, req); } +/** + * security_inet_conn_established() - Update sock's LSM state with connection + * @sk: sock + * @skb: connection packet + * + * Update @sock's LSM state to represent a new connection from @skb. + */ void security_inet_conn_established(struct sock *sk, - struct sk_buff *skb) + struct sk_buff *skb) { call_void_hook(inet_conn_established, sk, skb); } EXPORT_SYMBOL(security_inet_conn_established); +/** + * security_secmark_relabel_packet() - Check if setting a secmark is allowed + * @secid: new secmark value + * + * Check if the process should be allowed to relabel packets to @secid. + * + * Return: Returns 0 if permission is granted. + */ int security_secmark_relabel_packet(u32 secid) { return call_int_hook(secmark_relabel_packet, 0, secid); } EXPORT_SYMBOL(security_secmark_relabel_packet); +/** + * security_secmark_refcount_inc() - Increment the secmark labeling rule count + * + * Tells the LSM to increment the number of secmark labeling rules loaded. + */ void security_secmark_refcount_inc(void) { call_void_hook(secmark_refcount_inc); } EXPORT_SYMBOL(security_secmark_refcount_inc); +/** + * security_secmark_refcount_dec() - Decrement the secmark labeling rule count + * + * Tells the LSM to decrement the number of secmark labeling rules loaded. + */ void security_secmark_refcount_dec(void) { call_void_hook(secmark_refcount_dec); } EXPORT_SYMBOL(security_secmark_refcount_dec); +/** + * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device + * @security: pointer to the LSM blob + * + * This hook allows a module to allocate a security structure for a TUN device, + * returning the pointer in @security. + * + * Return: Returns a zero on success, negative values on failure. + */ int security_tun_dev_alloc_security(void **security) { return call_int_hook(tun_dev_alloc_security, 0, security); } EXPORT_SYMBOL(security_tun_dev_alloc_security); +/** + * security_tun_dev_free_security() - Free a TUN device LSM blob + * @security: LSM blob + * + * This hook allows a module to free the security structure for a TUN device. + */ void security_tun_dev_free_security(void *security) { call_void_hook(tun_dev_free_security, security); } EXPORT_SYMBOL(security_tun_dev_free_security); +/** + * security_tun_dev_create() - Check if creating a TUN device is allowed + * + * Check permissions prior to creating a new TUN device. + * + * Return: Returns 0 if permission is granted. + */ int security_tun_dev_create(void) { return call_int_hook(tun_dev_create, 0); } EXPORT_SYMBOL(security_tun_dev_create); +/** + * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed + * @security: TUN device LSM blob + * + * Check permissions prior to attaching to a TUN device queue. + * + * Return: Returns 0 if permission is granted. + */ int security_tun_dev_attach_queue(void *security) { return call_int_hook(tun_dev_attach_queue, 0, security); } EXPORT_SYMBOL(security_tun_dev_attach_queue); +/** + * security_tun_dev_attach() - Update TUN device LSM state on attach + * @sk: associated sock + * @security: TUN device LSM blob + * + * This hook can be used by the module to update any security state associated + * with the TUN device's sock structure. + * + * Return: Returns 0 if permission is granted. + */ int security_tun_dev_attach(struct sock *sk, void *security) { return call_int_hook(tun_dev_attach, 0, sk, security); } EXPORT_SYMBOL(security_tun_dev_attach); +/** + * security_tun_dev_open() - Update TUN device LSM state on open + * @security: TUN device LSM blob + * + * This hook can be used by the module to update any security state associated + * with the TUN device's security structure. + * + * Return: Returns 0 if permission is granted. + */ int security_tun_dev_open(void *security) { return call_int_hook(tun_dev_open, 0, security); } EXPORT_SYMBOL(security_tun_dev_open); -int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) +/** + * security_sctp_assoc_request() - Update the LSM on a SCTP association req + * @asoc: SCTP association + * @skb: packet requesting the association + * + * Passes the @asoc and @chunk->skb of the association INIT packet to the LSM. + * + * Return: Returns 0 on success, error on failure. + */ +int security_sctp_assoc_request(struct sctp_association *asoc, + struct sk_buff *skb) { return call_int_hook(sctp_assoc_request, 0, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); +/** + * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option + * @sk: socket + * @optname: SCTP option to validate + * @address: list of IP addresses to validate + * @addrlen: length of the address list + * + * Validiate permissions required for each address associated with sock @sk. + * Depending on @optname, the addresses will be treated as either a connect or + * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using + * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6). + * + * Return: Returns 0 on success, error on failure. + */ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { @@ -2480,6 +4633,16 @@ int security_sctp_bind_connect(struct sock *sk, int optname, } EXPORT_SYMBOL(security_sctp_bind_connect); +/** + * security_sctp_sk_clone() - Clone a SCTP sock's LSM state + * @asoc: SCTP association + * @sk: original sock + * @newsk: target sock + * + * Called whenever a new socket is created by accept(2) (i.e. a TCP style + * socket) or when a socket is 'peeled off' e.g userspace calls + * sctp_peeloff(3). + */ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { @@ -2487,6 +4650,16 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, } EXPORT_SYMBOL(security_sctp_sk_clone); +/** + * security_sctp_assoc_established() - Update LSM state when assoc established + * @asoc: SCTP association + * @skb: packet establishing the association + * + * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet to the + * security module. + * + * Return: Returns 0 if permission is granted. + */ int security_sctp_assoc_established(struct sctp_association *asoc, struct sk_buff *skb) { @@ -2497,25 +4670,60 @@ EXPORT_SYMBOL(security_sctp_assoc_established); #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND - +/** + * security_ib_pkey_access() - Check if access to an IB pkey is allowed + * @sec: LSM blob + * @subnet_prefix: subnet prefix of the port + * @pkey: IB pkey + * + * Check permission to access a pkey when modifing a QP. + * + * Return: Returns 0 if permission is granted. + */ int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey) { return call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey); } EXPORT_SYMBOL(security_ib_pkey_access); -int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num) +/** + * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed + * @sec: LSM blob + * @dev_name: IB device name + * @port_num: port number + * + * Check permissions to send and receive SMPs on a end port. + * + * Return: Returns 0 if permission is granted. + */ +int security_ib_endport_manage_subnet(void *sec, + const char *dev_name, u8 port_num) { - return call_int_hook(ib_endport_manage_subnet, 0, sec, dev_name, port_num); + return call_int_hook(ib_endport_manage_subnet, 0, sec, + dev_name, port_num); } EXPORT_SYMBOL(security_ib_endport_manage_subnet); +/** + * security_ib_alloc_security() - Allocate an Infiniband LSM blob + * @sec: LSM blob + * + * Allocate a security structure for Infiniband objects. + * + * Return: Returns 0 on success, non-zero on failure. + */ int security_ib_alloc_security(void **sec) { return call_int_hook(ib_alloc_security, 0, sec); } EXPORT_SYMBOL(security_ib_alloc_security); +/** + * security_ib_free_security() - Free an Infiniband LSM blob + * @sec: LSM blob + * + * Deallocate an Infiniband security structure. + */ void security_ib_free_security(void *sec) { call_void_hook(ib_free_security, sec); @@ -2524,7 +4732,17 @@ EXPORT_SYMBOL(security_ib_free_security); #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM - +/** + * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob + * @ctxp: xfrm security context being added to the SPD + * @sec_ctx: security label provided by userspace + * @gfp: gfp flags + * + * Allocate a security structure to the xp->security field; the security field + * is initialized to NULL when the xfrm_policy is allocated. + * + * Return: Return 0 if operation was successful. + */ int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) @@ -2533,23 +4751,58 @@ int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, } EXPORT_SYMBOL(security_xfrm_policy_alloc); +/** + * security_xfrm_policy_clone() - Clone xfrm policy LSM state + * @old_ctx: xfrm security context + * @new_ctxp: target xfrm security context + * + * Allocate a security structure in new_ctxp that contains the information from + * the old_ctx structure. + * + * Return: Return 0 if operation was successful. + */ int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, - struct xfrm_sec_ctx **new_ctxp) + struct xfrm_sec_ctx **new_ctxp) { return call_int_hook(xfrm_policy_clone_security, 0, old_ctx, new_ctxp); } +/** + * security_xfrm_policy_free() - Free a xfrm security context + * @ctx: xfrm security context + * + * Free LSM resources associated with @ctx. + */ void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx) { call_void_hook(xfrm_policy_free_security, ctx); } EXPORT_SYMBOL(security_xfrm_policy_free); +/** + * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed + * @ctx: xfrm security context + * + * Authorize deletion of a SPD entry. + * + * Return: Returns 0 if permission is granted. + */ int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { return call_int_hook(xfrm_policy_delete_security, 0, ctx); } +/** + * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob + * @x: xfrm state being added to the SAD + * @sec_ctx: security label provided by userspace + * + * Allocate a security structure to the @x->security field; the security field + * is initialized to NULL when the xfrm_state is allocated. Set the context to + * correspond to @sec_ctx. + * + * Return: Return 0 if operation was successful. + */ int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { @@ -2557,28 +4810,76 @@ int security_xfrm_state_alloc(struct xfrm_state *x, } EXPORT_SYMBOL(security_xfrm_state_alloc); +/** + * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob + * @x: xfrm state being added to the SAD + * @polsec: associated policy's security context + * @secid: secid from the flow + * + * Allocate a security structure to the x->security field; the security field + * is initialized to NULL when the xfrm_state is allocated. Set the context to + * correspond to secid. + * + * Return: Returns 0 if operation was successful. + */ int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) { return call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid); } +/** + * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed + * @x: xfrm state + * + * Authorize deletion of x->security. + * + * Return: Returns 0 if permission is granted. + */ int security_xfrm_state_delete(struct xfrm_state *x) { return call_int_hook(xfrm_state_delete_security, 0, x); } EXPORT_SYMBOL(security_xfrm_state_delete); +/** + * security_xfrm_state_free() - Free a xfrm state + * @x: xfrm state + * + * Deallocate x->security. + */ void security_xfrm_state_free(struct xfrm_state *x) { call_void_hook(xfrm_state_free_security, x); } +/** + * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed + * @ctx: target xfrm security context + * @fl_secid: flow secid used to authorize access + * + * Check permission when a flow selects a xfrm_policy for processing XFRMs on a + * packet. The hook is called when selecting either a per-socket policy or a + * generic xfrm policy. + * + * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on + * other errors. + */ int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid); } +/** + * security_xfrm_state_pol_flow_match() - Check for a xfrm match + * @x: xfrm state to match + * @xp: xfrm policy to check for a match + * @flic: flow to check for a match. + * + * Check @xp and @flic for a match with @x. + * + * Return: Returns 1 if there is a match. + */ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) @@ -2596,13 +4897,22 @@ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, * using the macro */ hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match, - list) { + list) { rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; } +/** + * security_xfrm_decode_session() - Determine the xfrm secid for a packet + * @skb: xfrm packet + * @secid: secid + * + * Decode the packet in @skb and return the security label in @secid. + * + * Return: Return 0 if all xfrms used have the same secid. + */ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) { return call_int_hook(xfrm_decode_session, 0, skb, secid, 1); @@ -2611,58 +4921,135 @@ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { int rc = call_int_hook(xfrm_decode_session, 0, skb, &flic->flowic_secid, - 0); + 0); BUG_ON(rc); } EXPORT_SYMBOL(security_skb_classify_flow); - #endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS - +/** + * security_key_alloc() - Allocate and initialize a kernel key LSM blob + * @key: key + * @cred: credentials + * @flags: allocation flags + * + * Permit allocation of a key and assign security data. Note that key does not + * have a serial number assigned at this point. + * + * Return: Return 0 if permission is granted, -ve error otherwise. + */ int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { return call_int_hook(key_alloc, 0, key, cred, flags); } +/** + * security_key_free() - Free a kernel key LSM blob + * @key: key + * + * Notification of destruction; free security data. + */ void security_key_free(struct key *key) { call_void_hook(key_free, key); } +/** + * security_key_permission() - Check if a kernel key operation is allowed + * @key_ref: key reference + * @cred: credentials of actor requesting access + * @need_perm: requested permissions + * + * See whether a specific operational right is granted to a process on a key. + * + * Return: Return 0 if permission is granted, -ve error otherwise. + */ int security_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { return call_int_hook(key_permission, 0, key_ref, cred, need_perm); } -int security_key_getsecurity(struct key *key, char **_buffer) +/** + * security_key_getsecurity() - Get the key's security label + * @key: key + * @buffer: security label buffer + * + * Get a textual representation of the security context attached to a key for + * the purposes of honouring KEYCTL_GETSECURITY. This function allocates the + * storage for the NUL-terminated string and the caller should free it. + * + * Return: Returns the length of @buffer (including terminating NUL) or -ve if + * an error occurs. May also return 0 (and a NULL buffer pointer) if + * there is no security label assigned to the key. + */ +int security_key_getsecurity(struct key *key, char **buffer) { - *_buffer = NULL; - return call_int_hook(key_getsecurity, 0, key, _buffer); + *buffer = NULL; + return call_int_hook(key_getsecurity, 0, key, buffer); } - #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT - +/** + * security_audit_rule_init() - Allocate and init an LSM audit rule struct + * @field: audit action + * @op: rule operator + * @rulestr: rule context + * @lsmrule: receive buffer for audit rule struct + * + * Allocate and initialize an LSM audit rule structure. + * + * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of + * an invalid rule. + */ int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule) { return call_int_hook(audit_rule_init, 0, field, op, rulestr, lsmrule); } +/** + * security_audit_rule_known() - Check if an audit rule contains LSM fields + * @krule: audit rule + * + * Specifies whether given @krule contains any fields related to the current + * LSM. + * + * Return: Returns 1 in case of relation found, 0 otherwise. + */ int security_audit_rule_known(struct audit_krule *krule) { return call_int_hook(audit_rule_known, 0, krule); } +/** + * security_audit_rule_free() - Free an LSM audit rule struct + * @lsmrule: audit rule struct + * + * Deallocate the LSM audit rule structure previously allocated by + * audit_rule_init(). + */ void security_audit_rule_free(void *lsmrule) { call_void_hook(audit_rule_free, lsmrule); } +/** + * security_audit_rule_match() - Check if a label matches an audit rule + * @secid: security label + * @field: LSM audit field + * @op: matching operator + * @lsmrule: audit rule + * + * Determine if given @secid matches a rule previously approved by + * security_audit_rule_known(). + * + * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on + * failure. + */ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule) { return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule); @@ -2670,36 +5057,110 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule) #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL +/** + * security_bpf() - Check if the bpf syscall operation is allowed + * @cmd: command + * @attr: bpf attribute + * @size: size + * + * Do a initial check for all bpf syscalls after the attribute is copied into + * the kernel. The actual security module can implement their own rules to + * check the specific cmd they need. + * + * Return: Returns 0 if permission is granted. + */ int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) { return call_int_hook(bpf, 0, cmd, attr, size); } + +/** + * security_bpf_map() - Check if access to a bpf map is allowed + * @map: bpf map + * @fmode: mode + * + * Do a check when the kernel generates and returns a file descriptor for eBPF + * maps. + * + * Return: Returns 0 if permission is granted. + */ int security_bpf_map(struct bpf_map *map, fmode_t fmode) { return call_int_hook(bpf_map, 0, map, fmode); } + +/** + * security_bpf_prog() - Check if access to a bpf program is allowed + * @prog: bpf program + * + * Do a check when the kernel generates and returns a file descriptor for eBPF + * programs. + * + * Return: Returns 0 if permission is granted. + */ int security_bpf_prog(struct bpf_prog *prog) { return call_int_hook(bpf_prog, 0, prog); } + +/** + * security_bpf_map_alloc() - Allocate a bpf map LSM blob + * @map: bpf map + * + * Initialize the security field inside bpf map. + * + * Return: Returns 0 on success, error on failure. + */ int security_bpf_map_alloc(struct bpf_map *map) { return call_int_hook(bpf_map_alloc_security, 0, map); } + +/** + * security_bpf_prog_alloc() - Allocate a bpf program LSM blob + * @aux: bpf program aux info struct + * + * Initialize the security field inside bpf program. + * + * Return: Returns 0 on success, error on failure. + */ int security_bpf_prog_alloc(struct bpf_prog_aux *aux) { return call_int_hook(bpf_prog_alloc_security, 0, aux); } + +/** + * security_bpf_map_free() - Free a bpf map's LSM blob + * @map: bpf map + * + * Clean up the security information stored inside bpf map. + */ void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free_security, map); } + +/** + * security_bpf_prog_free() - Free a bpf program's LSM blob + * @aux: bpf program aux info struct + * + * Clean up the security information stored inside bpf prog. + */ void security_bpf_prog_free(struct bpf_prog_aux *aux) { call_void_hook(bpf_prog_free_security, aux); } #endif /* CONFIG_BPF_SYSCALL */ +/** + * security_locked_down() - Check if a kernel feature is allowed + * @what: requested kernel feature + * + * Determine whether a kernel feature that potentially enables arbitrary code + * execution in kernel space should be permitted. + * + * Return: Returns 0 if permission is granted. + */ int security_locked_down(enum lockdown_reason what) { return call_int_hook(locked_down, 0, what); @@ -2707,26 +5168,65 @@ int security_locked_down(enum lockdown_reason what) EXPORT_SYMBOL(security_locked_down); #ifdef CONFIG_PERF_EVENTS +/** + * security_perf_event_open() - Check if a perf event open is allowed + * @attr: perf event attribute + * @type: type of event + * + * Check whether the @type of perf_event_open syscall is allowed. + * + * Return: Returns 0 if permission is granted. + */ int security_perf_event_open(struct perf_event_attr *attr, int type) { return call_int_hook(perf_event_open, 0, attr, type); } +/** + * security_perf_event_alloc() - Allocate a perf event LSM blob + * @event: perf event + * + * Allocate and save perf_event security info. + * + * Return: Returns 0 on success, error on failure. + */ int security_perf_event_alloc(struct perf_event *event) { return call_int_hook(perf_event_alloc, 0, event); } +/** + * security_perf_event_free() - Free a perf event LSM blob + * @event: perf event + * + * Release (free) perf_event security info. + */ void security_perf_event_free(struct perf_event *event) { call_void_hook(perf_event_free, event); } +/** + * security_perf_event_read() - Check if reading a perf event label is allowed + * @event: perf event + * + * Read perf_event security info if allowed. + * + * Return: Returns 0 if permission is granted. + */ int security_perf_event_read(struct perf_event *event) { return call_int_hook(perf_event_read, 0, event); } +/** + * security_perf_event_write() - Check if writing a perf event label is allowed + * @event: perf event + * + * Write perf_event security info if allowed. + * + * Return: Returns 0 if permission is granted. + */ int security_perf_event_write(struct perf_event *event) { return call_int_hook(perf_event_write, 0, event); @@ -2734,15 +5234,41 @@ int security_perf_event_write(struct perf_event *event) #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_IO_URING +/** + * security_uring_override_creds() - Check if overriding creds is allowed + * @new: new credentials + * + * Check if the current task, executing an io_uring operation, is allowed to + * override it's credentials with @new. + * + * Return: Returns 0 if permission is granted. + */ int security_uring_override_creds(const struct cred *new) { return call_int_hook(uring_override_creds, 0, new); } +/** + * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed + * + * Check whether the current task is allowed to spawn a io_uring polling thread + * (IORING_SETUP_SQPOLL). + * + * Return: Returns 0 if permission is granted. + */ int security_uring_sqpoll(void) { return call_int_hook(uring_sqpoll, 0); } + +/** + * security_uring_cmd() - Check if a io_uring passthrough command is allowed + * @ioucmd: command + * + * Check whether the file_operations uring_cmd is allowed to run. + * + * Return: Returns 0 if permission is granted. + */ int security_uring_cmd(struct io_uring_cmd *ioucmd) { return call_int_hook(uring_cmd, 0, ioucmd); diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index 9e921fc72538..95a186ec0fcb 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -23,30 +23,6 @@ config SECURITY_SELINUX_BOOTPARAM If you are unsure how to answer this question, answer N. -config SECURITY_SELINUX_DISABLE - bool "NSA SELinux runtime disable" - depends on SECURITY_SELINUX - select SECURITY_WRITABLE_HOOKS - default n - help - This option enables writing to a selinuxfs node 'disable', which - allows SELinux to be disabled at runtime prior to the policy load. - SELinux will then remain disabled until the next boot. - This option is similar to the selinux=0 boot parameter, but is to - support runtime disabling of SELinux, e.g. from /sbin/init, for - portability across platforms where boot parameters are difficult - to employ. - - NOTE: selecting this option will disable the '__ro_after_init' - kernel hardening feature for security hooks. Please consider - using the selinux=0 boot parameter instead of enabling this - option. - - WARNING: this option is deprecated and will be removed in a future - kernel release. - - If you are unsure how to answer this question, answer N. - config SECURITY_SELINUX_DEVELOP bool "NSA SELinux Development Support" depends on SECURITY_SELINUX @@ -70,29 +46,6 @@ config SECURITY_SELINUX_AVC_STATS /sys/fs/selinux/avc/cache_stats, which may be monitored via tools such as avcstat. -config SECURITY_SELINUX_CHECKREQPROT_VALUE - int "NSA SELinux checkreqprot default value" - depends on SECURITY_SELINUX - range 0 1 - default 0 - help - This option sets the default value for the 'checkreqprot' flag - that determines whether SELinux checks the protection requested - by the application or the protection that will be applied by the - kernel (including any implied execute for read-implies-exec) for - mmap and mprotect calls. If this option is set to 0 (zero), - SELinux will default to checking the protection that will be applied - by the kernel. If this option is set to 1 (one), SELinux will - default to checking the protection requested by the application. - The checkreqprot flag may be changed from the default via the - 'checkreqprot=' boot parameter. It may also be changed at runtime - via /sys/fs/selinux/checkreqprot if authorized by policy. - - WARNING: this option is deprecated and will be removed in a future - kernel release. - - If you are unsure how to answer this question, answer 0. - config SECURITY_SELINUX_SIDTAB_HASH_BITS int "NSA SELinux sidtab hashtable size" depends on SECURITY_SELINUX diff --git a/security/selinux/Makefile b/security/selinux/Makefile index 776162444882..0aecf9334ec3 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -23,8 +23,8 @@ ccflags-y := -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include $(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h quiet_cmd_flask = GEN $(obj)/flask.h $(obj)/av_permissions.h - cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h + cmd_flask = $< $(obj)/flask.h $(obj)/av_permissions.h targets += flask.h av_permissions.h -$(obj)/flask.h: $(src)/include/classmap.h FORCE +$(obj)/flask.h $(obj)/av_permissions.h &: scripts/selinux/genheaders/genheaders FORCE $(call if_changed,flask) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 9a43af0ebd7d..eaed5c2da02b 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -93,7 +93,7 @@ struct selinux_avc { static struct selinux_avc selinux_avc; -void selinux_avc_init(struct selinux_avc **avc) +void selinux_avc_init(void) { int i; @@ -104,18 +104,16 @@ void selinux_avc_init(struct selinux_avc **avc) } atomic_set(&selinux_avc.avc_cache.active_nodes, 0); atomic_set(&selinux_avc.avc_cache.lru_hint, 0); - *avc = &selinux_avc; } -unsigned int avc_get_cache_threshold(struct selinux_avc *avc) +unsigned int avc_get_cache_threshold(void) { - return avc->avc_cache_threshold; + return selinux_avc.avc_cache_threshold; } -void avc_set_cache_threshold(struct selinux_avc *avc, - unsigned int cache_threshold) +void avc_set_cache_threshold(unsigned int cache_threshold) { - avc->avc_cache_threshold = cache_threshold; + selinux_avc.avc_cache_threshold = cache_threshold; } static struct avc_callback_node *avc_callbacks __ro_after_init; @@ -150,7 +148,7 @@ void __init avc_init(void) 0, SLAB_PANIC, NULL); } -int avc_get_hash_stats(struct selinux_avc *avc, char *page) +int avc_get_hash_stats(char *page) { int i, chain_len, max_chain_len, slots_used; struct avc_node *node; @@ -161,7 +159,7 @@ int avc_get_hash_stats(struct selinux_avc *avc, char *page) slots_used = 0; max_chain_len = 0; for (i = 0; i < AVC_CACHE_SLOTS; i++) { - head = &avc->avc_cache.slots[i]; + head = &selinux_avc.avc_cache.slots[i]; if (!hlist_empty(head)) { slots_used++; chain_len = 0; @@ -176,7 +174,7 @@ int avc_get_hash_stats(struct selinux_avc *avc, char *page) return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n" "longest chain: %d\n", - atomic_read(&avc->avc_cache.active_nodes), + atomic_read(&selinux_avc.avc_cache.active_nodes), slots_used, AVC_CACHE_SLOTS, max_chain_len); } @@ -414,8 +412,7 @@ static inline u32 avc_xperms_audit_required(u32 requested, return audited; } -static inline int avc_xperms_audit(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd, struct extended_perms_decision *xpd, u8 perm, int result, @@ -427,7 +424,7 @@ static inline int avc_xperms_audit(struct selinux_state *state, requested, avd, xpd, perm, result, &denied); if (likely(!audited)) return 0; - return slow_avc_audit(state, ssid, tsid, tclass, requested, + return slow_avc_audit(ssid, tsid, tclass, requested, audited, denied, result, ad); } @@ -439,30 +436,29 @@ static void avc_node_free(struct rcu_head *rhead) avc_cache_stats_incr(frees); } -static void avc_node_delete(struct selinux_avc *avc, struct avc_node *node) +static void avc_node_delete(struct avc_node *node) { hlist_del_rcu(&node->list); call_rcu(&node->rhead, avc_node_free); - atomic_dec(&avc->avc_cache.active_nodes); + atomic_dec(&selinux_avc.avc_cache.active_nodes); } -static void avc_node_kill(struct selinux_avc *avc, struct avc_node *node) +static void avc_node_kill(struct avc_node *node) { avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); - atomic_dec(&avc->avc_cache.active_nodes); + atomic_dec(&selinux_avc.avc_cache.active_nodes); } -static void avc_node_replace(struct selinux_avc *avc, - struct avc_node *new, struct avc_node *old) +static void avc_node_replace(struct avc_node *new, struct avc_node *old) { hlist_replace_rcu(&old->list, &new->list); call_rcu(&old->rhead, avc_node_free); - atomic_dec(&avc->avc_cache.active_nodes); + atomic_dec(&selinux_avc.avc_cache.active_nodes); } -static inline int avc_reclaim_node(struct selinux_avc *avc) +static inline int avc_reclaim_node(void) { struct avc_node *node; int hvalue, try, ecx; @@ -471,17 +467,17 @@ static inline int avc_reclaim_node(struct selinux_avc *avc) spinlock_t *lock; for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) { - hvalue = atomic_inc_return(&avc->avc_cache.lru_hint) & + hvalue = atomic_inc_return(&selinux_avc.avc_cache.lru_hint) & (AVC_CACHE_SLOTS - 1); - head = &avc->avc_cache.slots[hvalue]; - lock = &avc->avc_cache.slots_lock[hvalue]; + head = &selinux_avc.avc_cache.slots[hvalue]; + lock = &selinux_avc.avc_cache.slots_lock[hvalue]; if (!spin_trylock_irqsave(lock, flags)) continue; rcu_read_lock(); hlist_for_each_entry(node, head, list) { - avc_node_delete(avc, node); + avc_node_delete(node); avc_cache_stats_incr(reclaims); ecx++; if (ecx >= AVC_CACHE_RECLAIM) { @@ -497,7 +493,7 @@ out: return ecx; } -static struct avc_node *avc_alloc_node(struct selinux_avc *avc) +static struct avc_node *avc_alloc_node(void) { struct avc_node *node; @@ -508,9 +504,9 @@ static struct avc_node *avc_alloc_node(struct selinux_avc *avc) INIT_HLIST_NODE(&node->list); avc_cache_stats_incr(allocations); - if (atomic_inc_return(&avc->avc_cache.active_nodes) > - avc->avc_cache_threshold) - avc_reclaim_node(avc); + if (atomic_inc_return(&selinux_avc.avc_cache.active_nodes) > + selinux_avc.avc_cache_threshold) + avc_reclaim_node(); out: return node; @@ -524,15 +520,14 @@ static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tcl memcpy(&node->ae.avd, avd, sizeof(node->ae.avd)); } -static inline struct avc_node *avc_search_node(struct selinux_avc *avc, - u32 ssid, u32 tsid, u16 tclass) +static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node, *ret = NULL; int hvalue; struct hlist_head *head; hvalue = avc_hash(ssid, tsid, tclass); - head = &avc->avc_cache.slots[hvalue]; + head = &selinux_avc.avc_cache.slots[hvalue]; hlist_for_each_entry_rcu(node, head, list) { if (ssid == node->ae.ssid && tclass == node->ae.tclass && @@ -547,7 +542,6 @@ static inline struct avc_node *avc_search_node(struct selinux_avc *avc, /** * avc_lookup - Look up an AVC entry. - * @avc: the access vector cache * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -558,13 +552,12 @@ static inline struct avc_node *avc_search_node(struct selinux_avc *avc, * then this function returns the avc_node. * Otherwise, this function returns NULL. */ -static struct avc_node *avc_lookup(struct selinux_avc *avc, - u32 ssid, u32 tsid, u16 tclass) +static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node; avc_cache_stats_incr(lookups); - node = avc_search_node(avc, ssid, tsid, tclass); + node = avc_search_node(ssid, tsid, tclass); if (node) return node; @@ -573,8 +566,7 @@ static struct avc_node *avc_lookup(struct selinux_avc *avc, return NULL; } -static int avc_latest_notif_update(struct selinux_avc *avc, - int seqno, int is_insert) +static int avc_latest_notif_update(int seqno, int is_insert) { int ret = 0; static DEFINE_SPINLOCK(notif_lock); @@ -582,14 +574,14 @@ static int avc_latest_notif_update(struct selinux_avc *avc, spin_lock_irqsave(¬if_lock, flag); if (is_insert) { - if (seqno < avc->avc_cache.latest_notif) { + if (seqno < selinux_avc.avc_cache.latest_notif) { pr_warn("SELinux: avc: seqno %d < latest_notif %d\n", - seqno, avc->avc_cache.latest_notif); + seqno, selinux_avc.avc_cache.latest_notif); ret = -EAGAIN; } } else { - if (seqno > avc->avc_cache.latest_notif) - avc->avc_cache.latest_notif = seqno; + if (seqno > selinux_avc.avc_cache.latest_notif) + selinux_avc.avc_cache.latest_notif = seqno; } spin_unlock_irqrestore(¬if_lock, flag); @@ -598,7 +590,6 @@ static int avc_latest_notif_update(struct selinux_avc *avc, /** * avc_insert - Insert an AVC entry. - * @avc: the access vector cache * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -612,13 +603,10 @@ static int avc_latest_notif_update(struct selinux_avc *avc, * response to a security_compute_av() call. If the * sequence number @avd->seqno is not less than the latest * revocation notification, then the function copies - * the access vectors into a cache entry, returns - * avc_node inserted. Otherwise, this function returns NULL. + * the access vectors into a cache entry. */ -static struct avc_node *avc_insert(struct selinux_avc *avc, - u32 ssid, u32 tsid, u16 tclass, - struct av_decision *avd, - struct avc_xperms_node *xp_node) +static void avc_insert(u32 ssid, u32 tsid, u16 tclass, + struct av_decision *avd, struct avc_xperms_node *xp_node) { struct avc_node *pos, *node = NULL; int hvalue; @@ -626,35 +614,35 @@ static struct avc_node *avc_insert(struct selinux_avc *avc, spinlock_t *lock; struct hlist_head *head; - if (avc_latest_notif_update(avc, avd->seqno, 1)) - return NULL; + if (avc_latest_notif_update(avd->seqno, 1)) + return; - node = avc_alloc_node(avc); + node = avc_alloc_node(); if (!node) - return NULL; + return; avc_node_populate(node, ssid, tsid, tclass, avd); if (avc_xperms_populate(node, xp_node)) { - avc_node_kill(avc, node); - return NULL; + avc_node_kill(node); + return; } hvalue = avc_hash(ssid, tsid, tclass); - head = &avc->avc_cache.slots[hvalue]; - lock = &avc->avc_cache.slots_lock[hvalue]; + head = &selinux_avc.avc_cache.slots[hvalue]; + lock = &selinux_avc.avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); hlist_for_each_entry(pos, head, list) { if (pos->ae.ssid == ssid && pos->ae.tsid == tsid && pos->ae.tclass == tclass) { - avc_node_replace(avc, node, pos); + avc_node_replace(node, pos); goto found; } } hlist_add_head_rcu(&node->list, head); found: spin_unlock_irqrestore(lock, flag); - return node; + return; } /** @@ -715,14 +703,14 @@ static void avc_audit_post_callback(struct audit_buffer *ab, void *a) u32 tcontext_len; int rc; - rc = security_sid_to_context(sad->state, sad->ssid, &scontext, + rc = security_sid_to_context(sad->ssid, &scontext, &scontext_len); if (rc) audit_log_format(ab, " ssid=%d", sad->ssid); else audit_log_format(ab, " scontext=%s", scontext); - rc = security_sid_to_context(sad->state, sad->tsid, &tcontext, + rc = security_sid_to_context(sad->tsid, &tcontext, &tcontext_len); if (rc) audit_log_format(ab, " tsid=%d", sad->tsid); @@ -740,7 +728,7 @@ static void avc_audit_post_callback(struct audit_buffer *ab, void *a) kfree(scontext); /* in case of invalid context report also the actual context string */ - rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext, + rc = security_sid_to_context_inval(sad->ssid, &scontext, &scontext_len); if (!rc && scontext) { if (scontext_len && scontext[scontext_len - 1] == '\0') @@ -750,7 +738,7 @@ static void avc_audit_post_callback(struct audit_buffer *ab, void *a) kfree(scontext); } - rc = security_sid_to_context_inval(sad->state, sad->tsid, &scontext, + rc = security_sid_to_context_inval(sad->tsid, &scontext, &scontext_len); if (!rc && scontext) { if (scontext_len && scontext[scontext_len - 1] == '\0') @@ -766,8 +754,7 @@ static void avc_audit_post_callback(struct audit_buffer *ab, void *a) * Note that it is non-blocking and can be called from under * rcu_read_lock(). */ -noinline int slow_avc_audit(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, u32 audited, u32 denied, int result, struct common_audit_data *a) { @@ -789,7 +776,6 @@ noinline int slow_avc_audit(struct selinux_state *state, sad.audited = audited; sad.denied = denied; sad.result = result; - sad.state = state; a->selinux_audit_data = &sad; @@ -827,7 +813,6 @@ out: /** * avc_update_node - Update an AVC entry - * @avc: the access vector cache * @event : Updating event * @perms : Permission mask bits * @driver: xperm driver information @@ -844,8 +829,7 @@ out: * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. */ -static int avc_update_node(struct selinux_avc *avc, - u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, +static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno, struct extended_perms_decision *xpd, u32 flags) @@ -856,7 +840,7 @@ static int avc_update_node(struct selinux_avc *avc, struct hlist_head *head; spinlock_t *lock; - node = avc_alloc_node(avc); + node = avc_alloc_node(); if (!node) { rc = -ENOMEM; goto out; @@ -865,8 +849,8 @@ static int avc_update_node(struct selinux_avc *avc, /* Lock the target slot */ hvalue = avc_hash(ssid, tsid, tclass); - head = &avc->avc_cache.slots[hvalue]; - lock = &avc->avc_cache.slots_lock[hvalue]; + head = &selinux_avc.avc_cache.slots[hvalue]; + lock = &selinux_avc.avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); @@ -882,7 +866,7 @@ static int avc_update_node(struct selinux_avc *avc, if (!orig) { rc = -ENOENT; - avc_node_kill(avc, node); + avc_node_kill(node); goto out_unlock; } @@ -895,7 +879,7 @@ static int avc_update_node(struct selinux_avc *avc, if (orig->ae.xp_node) { rc = avc_xperms_populate(node, orig->ae.xp_node); if (rc) { - avc_node_kill(avc, node); + avc_node_kill(node); goto out_unlock; } } @@ -926,7 +910,7 @@ static int avc_update_node(struct selinux_avc *avc, avc_add_xperms_decision(node, xpd); break; } - avc_node_replace(avc, node, orig); + avc_node_replace(node, orig); out_unlock: spin_unlock_irqrestore(lock, flag); out: @@ -935,9 +919,8 @@ out: /** * avc_flush - Flush the cache - * @avc: the access vector cache */ -static void avc_flush(struct selinux_avc *avc) +static void avc_flush(void) { struct hlist_head *head; struct avc_node *node; @@ -946,8 +929,8 @@ static void avc_flush(struct selinux_avc *avc) int i; for (i = 0; i < AVC_CACHE_SLOTS; i++) { - head = &avc->avc_cache.slots[i]; - lock = &avc->avc_cache.slots_lock[i]; + head = &selinux_avc.avc_cache.slots[i]; + lock = &selinux_avc.avc_cache.slots_lock[i]; spin_lock_irqsave(lock, flag); /* @@ -956,7 +939,7 @@ static void avc_flush(struct selinux_avc *avc) */ rcu_read_lock(); hlist_for_each_entry(node, head, list) - avc_node_delete(avc, node); + avc_node_delete(node); rcu_read_unlock(); spin_unlock_irqrestore(lock, flag); } @@ -964,15 +947,14 @@ static void avc_flush(struct selinux_avc *avc) /** * avc_ss_reset - Flush the cache and revalidate migrated permissions. - * @avc: the access vector cache * @seqno: policy sequence number */ -int avc_ss_reset(struct selinux_avc *avc, u32 seqno) +int avc_ss_reset(u32 seqno) { struct avc_callback_node *c; int rc = 0, tmprc; - avc_flush(avc); + avc_flush(); for (c = avc_callbacks; c; c = c->next) { if (c->events & AVC_CALLBACK_RESET) { @@ -984,34 +966,32 @@ int avc_ss_reset(struct selinux_avc *avc, u32 seqno) } } - avc_latest_notif_update(avc, seqno, 0); + avc_latest_notif_update(seqno, 0); return rc; } -/* - * Slow-path helper function for avc_has_perm_noaudit, - * when the avc_node lookup fails. We get called with - * the RCU read lock held, and need to return with it - * still held, but drop if for the security compute. +/** + * avc_compute_av - Add an entry to the AVC based on the security policy + * @ssid: subject + * @tsid: object/target + * @tclass: object class + * @avd: access vector decision + * @xp_node: AVC extended permissions node * - * Don't inline this, since it's the slow-path and just - * results in a bigger stack frame. + * Slow-path helper function for avc_has_perm_noaudit, when the avc_node lookup + * fails. Don't inline this, since it's the slow-path and just results in a + * bigger stack frame. */ -static noinline -struct avc_node *avc_compute_av(struct selinux_state *state, - u32 ssid, u32 tsid, - u16 tclass, struct av_decision *avd, - struct avc_xperms_node *xp_node) +static noinline void avc_compute_av(u32 ssid, u32 tsid, u16 tclass, + struct av_decision *avd, + struct avc_xperms_node *xp_node) { - rcu_read_unlock(); INIT_LIST_HEAD(&xp_node->xpd_head); - security_compute_av(state, ssid, tsid, tclass, avd, &xp_node->xp); - rcu_read_lock(); - return avc_insert(state->avc, ssid, tsid, tclass, avd, xp_node); + security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp); + avc_insert(ssid, tsid, tclass, avd, xp_node); } -static noinline int avc_denied(struct selinux_state *state, - u32 ssid, u32 tsid, +static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested, u8 driver, u8 xperm, unsigned int flags, struct av_decision *avd) @@ -1019,11 +999,11 @@ static noinline int avc_denied(struct selinux_state *state, if (flags & AVC_STRICT) return -EACCES; - if (enforcing_enabled(state) && + if (enforcing_enabled() && !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; - avc_update_node(state->avc, AVC_CALLBACK_GRANT, requested, driver, + avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid, tsid, tclass, avd->seqno, NULL, flags); return 0; } @@ -1035,8 +1015,7 @@ static noinline int avc_denied(struct selinux_state *state, * as-is the case with ioctls, then multiple may be chained together and the * driver field is used to specify which set contains the permission. */ -int avc_has_extended_perms(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, u32 requested, +int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, u8 driver, u8 xperm, struct common_audit_data *ad) { struct avc_node *node; @@ -1057,9 +1036,9 @@ int avc_has_extended_perms(struct selinux_state *state, rcu_read_lock(); - node = avc_lookup(state->avc, ssid, tsid, tclass); + node = avc_lookup(ssid, tsid, tclass); if (unlikely(!node)) { - avc_compute_av(state, ssid, tsid, tclass, &avd, xp_node); + avc_compute_av(ssid, tsid, tclass, &avd, xp_node); } else { memcpy(&avd, &node->ae.avd, sizeof(avd)); xp_node = node->ae.xp_node; @@ -1083,10 +1062,10 @@ int avc_has_extended_perms(struct selinux_state *state, goto decision; } rcu_read_unlock(); - security_compute_xperms_decision(state, ssid, tsid, tclass, + security_compute_xperms_decision(ssid, tsid, tclass, driver, &local_xpd); rcu_read_lock(); - avc_update_node(state->avc, AVC_CALLBACK_ADD_XPERMS, requested, + avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm, ssid, tsid, tclass, avd.seqno, &local_xpd, 0); } else { @@ -1100,12 +1079,12 @@ int avc_has_extended_perms(struct selinux_state *state, decision: denied = requested & ~(avd.allowed); if (unlikely(denied)) - rc = avc_denied(state, ssid, tsid, tclass, requested, + rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm, AVC_EXTENDED_PERMS, &avd); rcu_read_unlock(); - rc2 = avc_xperms_audit(state, ssid, tsid, tclass, requested, + rc2 = avc_xperms_audit(ssid, tsid, tclass, requested, &avd, xpd, xperm, rc, ad); if (rc2) return rc2; @@ -1113,8 +1092,35 @@ decision: } /** + * avc_perm_nonode - Add an entry to the AVC + * @ssid: subject + * @tsid: object/target + * @tclass: object class + * @requested: requested permissions + * @flags: AVC flags + * @avd: access vector decision + * + * This is the "we have no node" part of avc_has_perm_noaudit(), which is + * unlikely and needs extra stack space for the new node that we generate, so + * don't inline it. + */ +static noinline int avc_perm_nonode(u32 ssid, u32 tsid, u16 tclass, + u32 requested, unsigned int flags, + struct av_decision *avd) +{ + u32 denied; + struct avc_xperms_node xp_node; + + avc_compute_av(ssid, tsid, tclass, avd, &xp_node); + denied = requested & ~(avd->allowed); + if (unlikely(denied)) + return avc_denied(ssid, tsid, tclass, requested, 0, 0, + flags, avd); + return 0; +} + +/** * avc_has_perm_noaudit - Check permissions but perform no auditing. - * @state: SELinux state * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -1133,40 +1139,36 @@ decision: * auditing, e.g. in cases where a lock must be held for the check but * should be released for the auditing. */ -inline int avc_has_perm_noaudit(struct selinux_state *state, - u32 ssid, u32 tsid, +inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, unsigned int flags, struct av_decision *avd) { - struct avc_node *node; - struct avc_xperms_node xp_node; - int rc = 0; u32 denied; + struct avc_node *node; if (WARN_ON(!requested)) return -EACCES; rcu_read_lock(); + node = avc_lookup(ssid, tsid, tclass); + if (unlikely(!node)) { + rcu_read_unlock(); + return avc_perm_nonode(ssid, tsid, tclass, requested, + flags, avd); + } + denied = requested & ~node->ae.avd.allowed; + memcpy(avd, &node->ae.avd, sizeof(*avd)); + rcu_read_unlock(); - node = avc_lookup(state->avc, ssid, tsid, tclass); - if (unlikely(!node)) - avc_compute_av(state, ssid, tsid, tclass, avd, &xp_node); - else - memcpy(avd, &node->ae.avd, sizeof(*avd)); - - denied = requested & ~(avd->allowed); if (unlikely(denied)) - rc = avc_denied(state, ssid, tsid, tclass, requested, 0, 0, - flags, avd); - - rcu_read_unlock(); - return rc; + return avc_denied(ssid, tsid, tclass, requested, 0, 0, + flags, avd); + return 0; } /** * avc_has_perm - Check permissions and perform any appropriate auditing. - * @state: SELinux state * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -1181,25 +1183,25 @@ inline int avc_has_perm_noaudit(struct selinux_state *state, * permissions are granted, -%EACCES if any permissions are denied, or * another -errno upon other errors. */ -int avc_has_perm(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, +int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata) { struct av_decision avd; int rc, rc2; - rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested, 0, + rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd); - rc2 = avc_audit(state, ssid, tsid, tclass, requested, &avd, rc, + rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata); if (rc2) return rc2; return rc; } -u32 avc_policy_seqno(struct selinux_state *state) +u32 avc_policy_seqno(void) { - return state->avc->avc_cache.latest_notif; + return selinux_avc.avc_cache.latest_notif; } void avc_disable(void) @@ -1216,7 +1218,7 @@ void avc_disable(void) * the cache and get that memory back. */ if (avc_node_cachep) { - avc_flush(selinux_state.avc); + avc_flush(); /* kmem_cache_destroy(avc_node_cachep); */ } } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9a5bdfc21314..79b4890e9936 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -136,17 +136,13 @@ static int __init selinux_enabled_setup(char *str) __setup("selinux=", selinux_enabled_setup); #endif -static unsigned int selinux_checkreqprot_boot = - CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; - static int __init checkreqprot_setup(char *str) { unsigned long checkreqprot; if (!kstrtoul(str, 0, &checkreqprot)) { - selinux_checkreqprot_boot = checkreqprot ? 1 : 0; if (checkreqprot) - pr_err("SELinux: checkreqprot set to 1 via kernel parameter. This is deprecated and will be rejected in a future kernel release.\n"); + pr_err("SELinux: checkreqprot set to 1 via kernel parameter. This is no longer supported.\n"); } return 1; } @@ -257,7 +253,7 @@ static int __inode_security_revalidate(struct inode *inode, might_sleep_if(may_sleep); - if (selinux_initialized(&selinux_state) && + if (selinux_initialized() && isec->initialized != LABEL_INITIALIZED) { if (!may_sleep) return -ECHILD; @@ -403,14 +399,12 @@ static int may_context_mount_sb_relabel(u32 sid, const struct task_security_struct *tsec = selinux_cred(cred); int rc; - rc = avc_has_perm(&selinux_state, - tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; - rc = avc_has_perm(&selinux_state, - tsec->sid, sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(tsec->sid, sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELTO, NULL); return rc; } @@ -421,14 +415,12 @@ static int may_context_mount_inode_relabel(u32 sid, { const struct task_security_struct *tsec = selinux_cred(cred); int rc; - rc = avc_has_perm(&selinux_state, - tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; - rc = avc_has_perm(&selinux_state, - sid, sbsec->sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, NULL); return rc; } @@ -511,7 +503,7 @@ static int sb_check_xattr_support(struct super_block *sb) fallback: /* No xattr support - try to fallback to genfs if possible. */ - rc = security_genfs_sid(&selinux_state, sb->s_type->name, "/", + rc = security_genfs_sid(sb->s_type->name, "/", SECCLASS_DIR, &sid); if (rc) return -EOPNOTSUPP; @@ -615,7 +607,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, mutex_lock(&sbsec->lock); - if (!selinux_initialized(&selinux_state)) { + if (!selinux_initialized()) { if (!opts) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security @@ -716,7 +708,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, * Determine the labeling behavior to use for this * filesystem type. */ - rc = security_fs_use(&selinux_state, sb); + rc = security_fs_use(sb); if (rc) { pr_warn("%s: security_fs_use(%s) returned %d\n", __func__, sb->s_type->name, rc); @@ -741,8 +733,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, } if (sbsec->behavior == SECURITY_FS_USE_XATTR) { sbsec->behavior = SECURITY_FS_USE_MNTPOINT; - rc = security_transition_sid(&selinux_state, - current_sid(), + rc = security_transition_sid(current_sid(), current_sid(), SECCLASS_FILE, NULL, &sbsec->mntpoint_sid); @@ -881,7 +872,7 @@ static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, * if the parent was able to be mounted it clearly had no special lsm * mount options. thus we can safely deal with this superblock later */ - if (!selinux_initialized(&selinux_state)) + if (!selinux_initialized()) return 0; /* @@ -911,7 +902,7 @@ static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, if (newsbsec->behavior == SECURITY_FS_USE_NATIVE && !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) { - rc = security_fs_use(&selinux_state, newsb); + rc = security_fs_use(newsb); if (rc) goto out; } @@ -960,7 +951,7 @@ static int selinux_add_opt(int token, const char *s, void **mnt_opts) if (!s) return -EINVAL; - if (!selinux_initialized(&selinux_state)) { + if (!selinux_initialized()) { pr_warn("SELinux: Unable to set superblock options before the security server is initialized\n"); return -EINVAL; } @@ -997,7 +988,7 @@ static int selinux_add_opt(int token, const char *s, void **mnt_opts) WARN_ON(1); return -EINVAL; } - rc = security_context_str_to_sid(&selinux_state, s, dst_sid, GFP_KERNEL); + rc = security_context_str_to_sid(s, dst_sid, GFP_KERNEL); if (rc) pr_warn("SELinux: security_context_str_to_sid (%s) failed with errno=%d\n", s, rc); @@ -1014,8 +1005,7 @@ static int show_sid(struct seq_file *m, u32 sid) u32 len; int rc; - rc = security_sid_to_context(&selinux_state, sid, - &context, &len); + rc = security_sid_to_context(sid, &context, &len); if (!rc) { bool has_comma = strchr(context, ','); @@ -1038,7 +1028,7 @@ static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; - if (!selinux_initialized(&selinux_state)) + if (!selinux_initialized()) return 0; if (sbsec->flags & FSCONTEXT_MNT) { @@ -1292,7 +1282,7 @@ static int selinux_genfs_get_sid(struct dentry *dentry, path++; } } - rc = security_genfs_sid(&selinux_state, sb->s_type->name, + rc = security_genfs_sid(sb->s_type->name, path, tclass, sid); if (rc == -ENOENT) { /* No match in policy, mark as unlabeled. */ @@ -1347,7 +1337,7 @@ static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, return 0; } - rc = security_context_to_sid_default(&selinux_state, context, rc, sid, + rc = security_context_to_sid_default(context, rc, sid, def_sid, GFP_NOFS); if (rc) { char *dev = inode->i_sb->s_id; @@ -1454,7 +1444,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent sid = sbsec->sid; /* Try to obtain a transition SID. */ - rc = security_transition_sid(&selinux_state, task_sid, sid, + rc = security_transition_sid(task_sid, sid, sclass, NULL, &sid); if (rc) goto out; @@ -1599,11 +1589,9 @@ static int cred_has_capability(const struct cred *cred, return -EINVAL; } - rc = avc_has_perm_noaudit(&selinux_state, - sid, sid, sclass, av, 0, &avd); + rc = avc_has_perm_noaudit(sid, sid, sclass, av, 0, &avd); if (!(opts & CAP_OPT_NOAUDIT)) { - int rc2 = avc_audit(&selinux_state, - sid, sid, sclass, av, &avd, rc, &ad); + int rc2 = avc_audit(sid, sid, sclass, av, &avd, rc, &ad); if (rc2) return rc2; } @@ -1629,8 +1617,7 @@ static int inode_has_perm(const struct cred *cred, sid = cred_sid(cred); isec = selinux_inode(inode); - return avc_has_perm(&selinux_state, - sid, isec->sid, isec->sclass, perms, adp); + return avc_has_perm(sid, isec->sid, isec->sclass, perms, adp); } /* Same as inode_has_perm, but pass explicit audit data containing @@ -1703,8 +1690,7 @@ static int file_has_perm(const struct cred *cred, ad.u.file = file; if (sid != fsec->sid) { - rc = avc_has_perm(&selinux_state, - sid, fsec->sid, + rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); @@ -1747,7 +1733,7 @@ selinux_determine_inode_label(const struct task_security_struct *tsec, *_new_isid = tsec->create_sid; } else { const struct inode_security_struct *dsec = inode_security(dir); - return security_transition_sid(&selinux_state, tsec->sid, + return security_transition_sid(tsec->sid, dsec->sid, tclass, name, _new_isid); } @@ -1775,8 +1761,7 @@ static int may_create(struct inode *dir, ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; - rc = avc_has_perm(&selinux_state, - sid, dsec->sid, SECCLASS_DIR, + rc = avc_has_perm(sid, dsec->sid, SECCLASS_DIR, DIR__ADD_NAME | DIR__SEARCH, &ad); if (rc) @@ -1787,13 +1772,11 @@ static int may_create(struct inode *dir, if (rc) return rc; - rc = avc_has_perm(&selinux_state, - sid, newsid, tclass, FILE__CREATE, &ad); + rc = avc_has_perm(sid, newsid, tclass, FILE__CREATE, &ad); if (rc) return rc; - return avc_has_perm(&selinux_state, - newsid, sbsec->sid, + return avc_has_perm(newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } @@ -1822,8 +1805,7 @@ static int may_link(struct inode *dir, av = DIR__SEARCH; av |= (kind ? DIR__REMOVE_NAME : DIR__ADD_NAME); - rc = avc_has_perm(&selinux_state, - sid, dsec->sid, SECCLASS_DIR, av, &ad); + rc = avc_has_perm(sid, dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; @@ -1843,8 +1825,7 @@ static int may_link(struct inode *dir, return 0; } - rc = avc_has_perm(&selinux_state, - sid, isec->sid, isec->sclass, av, &ad); + rc = avc_has_perm(sid, isec->sid, isec->sclass, av, &ad); return rc; } @@ -1868,19 +1849,16 @@ static inline int may_rename(struct inode *old_dir, ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = old_dentry; - rc = avc_has_perm(&selinux_state, - sid, old_dsec->sid, SECCLASS_DIR, + rc = avc_has_perm(sid, old_dsec->sid, SECCLASS_DIR, DIR__REMOVE_NAME | DIR__SEARCH, &ad); if (rc) return rc; - rc = avc_has_perm(&selinux_state, - sid, old_isec->sid, + rc = avc_has_perm(sid, old_isec->sid, old_isec->sclass, FILE__RENAME, &ad); if (rc) return rc; if (old_is_dir && new_dir != old_dir) { - rc = avc_has_perm(&selinux_state, - sid, old_isec->sid, + rc = avc_has_perm(sid, old_isec->sid, old_isec->sclass, DIR__REPARENT, &ad); if (rc) return rc; @@ -1890,15 +1868,13 @@ static inline int may_rename(struct inode *old_dir, av = DIR__ADD_NAME | DIR__SEARCH; if (d_is_positive(new_dentry)) av |= DIR__REMOVE_NAME; - rc = avc_has_perm(&selinux_state, - sid, new_dsec->sid, SECCLASS_DIR, av, &ad); + rc = avc_has_perm(sid, new_dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; if (d_is_positive(new_dentry)) { new_isec = backing_inode_security(new_dentry); new_is_dir = d_is_dir(new_dentry); - rc = avc_has_perm(&selinux_state, - sid, new_isec->sid, + rc = avc_has_perm(sid, new_isec->sid, new_isec->sclass, (new_is_dir ? DIR__RMDIR : FILE__UNLINK), &ad); if (rc) @@ -1918,8 +1894,7 @@ static int superblock_has_perm(const struct cred *cred, u32 sid = cred_sid(cred); sbsec = selinux_superblock(sb); - return avc_has_perm(&selinux_state, - sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); + return avc_has_perm(sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); } /* Convert a Linux mode and permission mask to an access vector. */ @@ -1993,8 +1968,7 @@ static inline u32 open_file_to_av(struct file *file) static int selinux_binder_set_context_mgr(const struct cred *mgr) { - return avc_has_perm(&selinux_state, - current_sid(), cred_sid(mgr), SECCLASS_BINDER, + return avc_has_perm(current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } @@ -2007,22 +1981,20 @@ static int selinux_binder_transaction(const struct cred *from, int rc; if (mysid != fromsid) { - rc = avc_has_perm(&selinux_state, - mysid, fromsid, SECCLASS_BINDER, + rc = avc_has_perm(mysid, fromsid, SECCLASS_BINDER, BINDER__IMPERSONATE, NULL); if (rc) return rc; } - return avc_has_perm(&selinux_state, fromsid, tosid, + return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, NULL); } static int selinux_binder_transfer_binder(const struct cred *from, const struct cred *to) { - return avc_has_perm(&selinux_state, - cred_sid(from), cred_sid(to), + return avc_has_perm(cred_sid(from), cred_sid(to), SECCLASS_BINDER, BINDER__TRANSFER, NULL); } @@ -2042,8 +2014,7 @@ static int selinux_binder_transfer_file(const struct cred *from, ad.u.path = file->f_path; if (sid != fsec->sid) { - rc = avc_has_perm(&selinux_state, - sid, fsec->sid, + rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); @@ -2061,8 +2032,7 @@ static int selinux_binder_transfer_file(const struct cred *from, return 0; isec = backing_inode_security(dentry); - return avc_has_perm(&selinux_state, - sid, isec->sid, isec->sclass, file_to_av(file), + return avc_has_perm(sid, isec->sid, isec->sclass, file_to_av(file), &ad); } @@ -2073,26 +2043,24 @@ static int selinux_ptrace_access_check(struct task_struct *child, u32 csid = task_sid_obj(child); if (mode & PTRACE_MODE_READ) - return avc_has_perm(&selinux_state, - sid, csid, SECCLASS_FILE, FILE__READ, NULL); + return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, + NULL); - return avc_has_perm(&selinux_state, - sid, csid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); + return avc_has_perm(sid, csid, SECCLASS_PROCESS, PROCESS__PTRACE, + NULL); } static int selinux_ptrace_traceme(struct task_struct *parent) { - return avc_has_perm(&selinux_state, - task_sid_obj(parent), task_sid_obj(current), + return avc_has_perm(task_sid_obj(parent), task_sid_obj(current), SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(target), SECCLASS_PROCESS, - PROCESS__GETCAP, NULL); + return avc_has_perm(current_sid(), task_sid_obj(target), + SECCLASS_PROCESS, PROCESS__GETCAP, NULL); } static int selinux_capset(struct cred *new, const struct cred *old, @@ -2100,8 +2068,7 @@ static int selinux_capset(struct cred *new, const struct cred *old, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { - return avc_has_perm(&selinux_state, - cred_sid(old), cred_sid(new), SECCLASS_PROCESS, + return avc_has_perm(cred_sid(old), cred_sid(new), SECCLASS_PROCESS, PROCESS__SETCAP, NULL); } @@ -2168,21 +2135,18 @@ static int selinux_syslog(int type) switch (type) { case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ - return avc_has_perm(&selinux_state, - current_sid(), SECINITSID_KERNEL, + return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, NULL); case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: - return avc_has_perm(&selinux_state, - current_sid(), SECINITSID_KERNEL, + return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE, NULL); } /* All other syslog types */ - return avc_has_perm(&selinux_state, - current_sid(), SECINITSID_KERNEL, + return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, NULL); } @@ -2249,8 +2213,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, av |= PROCESS2__NNP_TRANSITION; if (nosuid) av |= PROCESS2__NOSUID_TRANSITION; - rc = avc_has_perm(&selinux_state, - old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS2, av, NULL); if (!rc) return 0; @@ -2261,7 +2224,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, * i.e. SIDs that are guaranteed to only be allowed a subset * of the permissions of the current SID. */ - rc = security_bounded_transition(&selinux_state, old_tsec->sid, + rc = security_bounded_transition(old_tsec->sid, new_tsec->sid); if (!rc) return 0; @@ -2312,7 +2275,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) return rc; } else { /* Check for a default transition on this program. */ - rc = security_transition_sid(&selinux_state, old_tsec->sid, + rc = security_transition_sid(old_tsec->sid, isec->sid, SECCLASS_PROCESS, NULL, &new_tsec->sid); if (rc) @@ -2331,29 +2294,25 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) ad.u.file = bprm->file; if (new_tsec->sid == old_tsec->sid) { - rc = avc_has_perm(&selinux_state, - old_tsec->sid, isec->sid, + rc = avc_has_perm(old_tsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. */ - rc = avc_has_perm(&selinux_state, - old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; - rc = avc_has_perm(&selinux_state, - new_tsec->sid, isec->sid, + rc = avc_has_perm(new_tsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; /* Check for shared state */ if (bprm->unsafe & LSM_UNSAFE_SHARE) { - rc = avc_has_perm(&selinux_state, - old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__SHARE, NULL); if (rc) @@ -2365,8 +2324,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { - rc = avc_has_perm(&selinux_state, - ptsid, new_tsec->sid, + rc = avc_has_perm(ptsid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (rc) @@ -2380,8 +2338,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) /* Enable secure mode for SIDs transitions unless the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ - rc = avc_has_perm(&selinux_state, - old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__NOATSECURE, NULL); bprm->secureexec |= !!rc; @@ -2473,8 +2430,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) * higher than the default soft limit for cases where the default is * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. */ - rc = avc_has_perm(&selinux_state, - new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, + rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { /* protect against do_prlimit() */ @@ -2513,8 +2469,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) * This must occur _after_ the task SID has been updated so that any * kill done after the flush will be checked against the new SID. */ - rc = avc_has_perm(&selinux_state, - osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); + rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { clear_itimer(); @@ -2841,7 +2796,7 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode, if (xattr_name) *xattr_name = XATTR_NAME_SELINUX; - return security_sid_to_context(&selinux_state, newsid, (char **)ctx, + return security_sid_to_context(newsid, (char **)ctx, ctxlen); } @@ -2895,7 +2850,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, isec->initialized = LABEL_INITIALIZED; } - if (!selinux_initialized(&selinux_state) || + if (!selinux_initialized() || !(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; @@ -2903,7 +2858,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, *name = XATTR_SELINUX_SUFFIX; if (value && len) { - rc = security_sid_to_context_force(&selinux_state, newsid, + rc = security_sid_to_context_force(newsid, &context, &clen); if (rc) return rc; @@ -2923,7 +2878,7 @@ static int selinux_inode_init_security_anon(struct inode *inode, struct inode_security_struct *isec; int rc; - if (unlikely(!selinux_initialized(&selinux_state))) + if (unlikely(!selinux_initialized())) return 0; isec = selinux_inode(inode); @@ -2947,7 +2902,7 @@ static int selinux_inode_init_security_anon(struct inode *inode, } else { isec->sclass = SECCLASS_ANON_INODE; rc = security_transition_sid( - &selinux_state, tsec->sid, tsec->sid, + tsec->sid, tsec->sid, isec->sclass, name, &isec->sid); if (rc) return rc; @@ -2962,8 +2917,7 @@ static int selinux_inode_init_security_anon(struct inode *inode, ad.type = LSM_AUDIT_DATA_ANONINODE; ad.u.anonclass = name ? (const char *)name->name : "?"; - return avc_has_perm(&selinux_state, - tsec->sid, + return avc_has_perm(tsec->sid, isec->sid, isec->sclass, FILE__CREATE, @@ -3035,8 +2989,7 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode, if (IS_ERR(isec)) return PTR_ERR(isec); - return avc_has_perm(&selinux_state, - sid, isec->sid, isec->sclass, FILE__READ, &ad); + return avc_has_perm(sid, isec->sid, isec->sclass, FILE__READ, &ad); } static noinline int audit_inode_permission(struct inode *inode, @@ -3049,8 +3002,7 @@ static noinline int audit_inode_permission(struct inode *inode, ad.type = LSM_AUDIT_DATA_INODE; ad.u.inode = inode; - return slow_avc_audit(&selinux_state, - current_sid(), isec->sid, isec->sclass, perms, + return slow_avc_audit(current_sid(), isec->sid, isec->sclass, perms, audited, denied, result, &ad); } @@ -3085,8 +3037,7 @@ static int selinux_inode_permission(struct inode *inode, int mask) if (IS_ERR(isec)) return PTR_ERR(isec); - rc = avc_has_perm_noaudit(&selinux_state, - sid, isec->sid, isec->sclass, perms, 0, + rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass, perms, 0, &avd); audited = avc_audit_required(perms, &avd, rc, from_access ? FILE__AUDIT_ACCESS : 0, @@ -3166,7 +3117,7 @@ static int selinux_inode_setxattr(struct mnt_idmap *idmap, return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } - if (!selinux_initialized(&selinux_state)) + if (!selinux_initialized()) return (inode_owner_or_capable(idmap, inode) ? 0 : -EPERM); sbsec = selinux_superblock(inode->i_sb); @@ -3180,13 +3131,12 @@ static int selinux_inode_setxattr(struct mnt_idmap *idmap, ad.u.dentry = dentry; isec = backing_inode_security(dentry); - rc = avc_has_perm(&selinux_state, - sid, isec->sid, isec->sclass, + rc = avc_has_perm(sid, isec->sid, isec->sclass, FILE__RELABELFROM, &ad); if (rc) return rc; - rc = security_context_to_sid(&selinux_state, value, size, &newsid, + rc = security_context_to_sid(value, size, &newsid, GFP_KERNEL); if (rc == -EINVAL) { if (!has_cap_mac_admin(true)) { @@ -3215,25 +3165,23 @@ static int selinux_inode_setxattr(struct mnt_idmap *idmap, return rc; } - rc = security_context_to_sid_force(&selinux_state, value, + rc = security_context_to_sid_force(value, size, &newsid); } if (rc) return rc; - rc = avc_has_perm(&selinux_state, - sid, newsid, isec->sclass, + rc = avc_has_perm(sid, newsid, isec->sclass, FILE__RELABELTO, &ad); if (rc) return rc; - rc = security_validate_transition(&selinux_state, isec->sid, newsid, + rc = security_validate_transition(isec->sid, newsid, sid, isec->sclass); if (rc) return rc; - return avc_has_perm(&selinux_state, - newsid, + return avc_has_perm(newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, @@ -3273,7 +3221,7 @@ static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, return; } - if (!selinux_initialized(&selinux_state)) { + if (!selinux_initialized()) { /* If we haven't even been initialized, then we can't validate * against a policy, so leave the label as invalid. It may * resolve to a valid label on the next revalidation try if @@ -3282,7 +3230,7 @@ static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, return; } - rc = security_context_to_sid_force(&selinux_state, value, size, + rc = security_context_to_sid_force(value, size, &newsid); if (rc) { pr_err("SELinux: unable to map context to SID" @@ -3326,7 +3274,7 @@ static int selinux_inode_removexattr(struct mnt_idmap *idmap, return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } - if (!selinux_initialized(&selinux_state)) + if (!selinux_initialized()) return 0; /* No one is allowed to remove a SELinux security label. @@ -3396,7 +3344,7 @@ static int selinux_inode_getsecurity(struct mnt_idmap *idmap, * If we're not initialized yet, then we can't validate contexts, so * just let vfs_getxattr fall back to using the on-disk xattr. */ - if (!selinux_initialized(&selinux_state) || + if (!selinux_initialized() || strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; @@ -3411,11 +3359,10 @@ static int selinux_inode_getsecurity(struct mnt_idmap *idmap, */ isec = inode_security(inode); if (has_cap_mac_admin(false)) - error = security_sid_to_context_force(&selinux_state, - isec->sid, &context, + error = security_sid_to_context_force(isec->sid, &context, &size); else - error = security_sid_to_context(&selinux_state, isec->sid, + error = security_sid_to_context(isec->sid, &context, &size); if (error) return error; @@ -3447,7 +3394,7 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name, if (!value || !size) return -EACCES; - rc = security_context_to_sid(&selinux_state, value, size, &newsid, + rc = security_context_to_sid(value, size, &newsid, GFP_KERNEL); if (rc) return rc; @@ -3464,7 +3411,7 @@ static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t { const int len = sizeof(XATTR_NAME_SELINUX); - if (!selinux_initialized(&selinux_state)) + if (!selinux_initialized()) return 0; if (buffer && len <= buffer_size) @@ -3540,7 +3487,7 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, return rc; } - rc = security_context_to_sid(&selinux_state, context, clen, &parent_sid, + rc = security_context_to_sid(context, clen, &parent_sid, GFP_KERNEL); kfree(context); if (rc) @@ -3555,14 +3502,14 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, q.name = kn->name; q.hash_len = hashlen_string(kn_dir, kn->name); - rc = security_transition_sid(&selinux_state, tsec->sid, + rc = security_transition_sid(tsec->sid, parent_sid, secclass, &q, &newsid); if (rc) return rc; } - rc = security_sid_to_context_force(&selinux_state, newsid, + rc = security_sid_to_context_force(newsid, &context, &clen); if (rc) return rc; @@ -3602,7 +3549,7 @@ static int selinux_file_permission(struct file *file, int mask) isec = inode_security(inode); if (sid == fsec->sid && fsec->isid == isec->sid && - fsec->pseqno == avc_policy_seqno(&selinux_state)) + fsec->pseqno == avc_policy_seqno()) /* No change since file_open check. */ return 0; @@ -3643,8 +3590,7 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file, ad.u.op->path = file->f_path; if (ssid != fsec->sid) { - rc = avc_has_perm(&selinux_state, - ssid, fsec->sid, + rc = avc_has_perm(ssid, fsec->sid, SECCLASS_FD, FD__USE, &ad); @@ -3656,8 +3602,7 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file, return 0; isec = inode_security(inode); - rc = avc_has_extended_perms(&selinux_state, - ssid, isec->sid, isec->sclass, + rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested, driver, xperm, &ad); out: return rc; @@ -3726,8 +3671,7 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared * private file mapping that will also be writable. * This has an additional check. */ - rc = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_PROCESS, + rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM, NULL); if (rc) goto error; @@ -3757,15 +3701,15 @@ static int selinux_mmap_addr(unsigned long addr) if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { u32 sid = current_sid(); - rc = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_MEMPROTECT, + rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); } return rc; } -static int selinux_mmap_file(struct file *file, unsigned long reqprot, +static int selinux_mmap_file(struct file *file, + unsigned long reqprot __always_unused, unsigned long prot, unsigned long flags) { struct common_audit_data ad; @@ -3780,37 +3724,29 @@ static int selinux_mmap_file(struct file *file, unsigned long reqprot, return rc; } - if (checkreqprot_get(&selinux_state)) - prot = reqprot; - return file_map_prot_check(file, prot, (flags & MAP_TYPE) == MAP_SHARED); } static int selinux_file_mprotect(struct vm_area_struct *vma, - unsigned long reqprot, + unsigned long reqprot __always_unused, unsigned long prot) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); - if (checkreqprot_get(&selinux_state)) - prot = reqprot; - if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; if (vma->vm_start >= vma->vm_mm->start_brk && vma->vm_end <= vma->vm_mm->brk) { - rc = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_PROCESS, + rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); } else if (!vma->vm_file && ((vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) || vma_is_stack_for_current(vma))) { - rc = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_PROCESS, + rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECSTACK, NULL); } else if (vma->vm_file && vma->anon_vma) { /* @@ -3902,8 +3838,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk, else perm = signal_to_av(signum); - return avc_has_perm(&selinux_state, - fsec->fown_sid, sid, + return avc_has_perm(fsec->fown_sid, sid, SECCLASS_PROCESS, perm, NULL); } @@ -3929,7 +3864,7 @@ static int selinux_file_open(struct file *file) * struct as its SID. */ fsec->isid = isec->sid; - fsec->pseqno = avc_policy_seqno(&selinux_state); + fsec->pseqno = avc_policy_seqno(); /* * Since the inode label or policy seqno may have changed * between the selinux_inode_permission check and the saving @@ -3948,8 +3883,7 @@ static int selinux_task_alloc(struct task_struct *task, { u32 sid = current_sid(); - return avc_has_perm(&selinux_state, - sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); + return avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); } /* @@ -3991,8 +3925,7 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid) u32 sid = current_sid(); int ret; - ret = avc_has_perm(&selinux_state, - sid, secid, + ret = avc_has_perm(sid, secid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__USE_AS_OVERRIDE, NULL); @@ -4016,8 +3949,7 @@ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) u32 sid = current_sid(); int ret; - ret = avc_has_perm(&selinux_state, - sid, isec->sid, + ret = avc_has_perm(sid, isec->sid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__CREATE_FILES_AS, NULL); @@ -4034,8 +3966,7 @@ static int selinux_kernel_module_request(char *kmod_name) ad.type = LSM_AUDIT_DATA_KMOD; ad.u.kmod_name = kmod_name; - return avc_has_perm(&selinux_state, - current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, + return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__MODULE_REQUEST, &ad); } @@ -4049,8 +3980,7 @@ static int selinux_kernel_module_from_file(struct file *file) /* init_module */ if (file == NULL) - return avc_has_perm(&selinux_state, - sid, sid, SECCLASS_SYSTEM, + return avc_has_perm(sid, sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, NULL); /* finit_module */ @@ -4060,15 +3990,13 @@ static int selinux_kernel_module_from_file(struct file *file) fsec = selinux_file(file); if (sid != fsec->sid) { - rc = avc_has_perm(&selinux_state, - sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); + rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } isec = inode_security(file_inode(file)); - return avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_SYSTEM, + return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, &ad); } @@ -4106,22 +4034,19 @@ static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents) static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETPGID, NULL); } static int selinux_task_getpgid(struct task_struct *p) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETPGID, NULL); } static int selinux_task_getsid(struct task_struct *p) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSESSION, NULL); } @@ -4137,22 +4062,19 @@ static void selinux_task_getsecid_obj(struct task_struct *p, u32 *secid) static int selinux_task_setnice(struct task_struct *p, int nice) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_setioprio(struct task_struct *p, int ioprio) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getioprio(struct task_struct *p) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } @@ -4167,8 +4089,7 @@ static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcre av |= PROCESS__SETRLIMIT; if (flags & LSM_PRLIMIT_READ) av |= PROCESS__GETRLIMIT; - return avc_has_perm(&selinux_state, - cred_sid(cred), cred_sid(tcred), + return avc_has_perm(cred_sid(cred), cred_sid(tcred), SECCLASS_PROCESS, av, NULL); } @@ -4182,8 +4103,7 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL); return 0; @@ -4191,22 +4111,19 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, static int selinux_task_setscheduler(struct task_struct *p) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getscheduler(struct task_struct *p) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_movememory(struct task_struct *p) { - return avc_has_perm(&selinux_state, - current_sid(), task_sid_obj(p), SECCLASS_PROCESS, + return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } @@ -4224,8 +4141,7 @@ static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info, secid = current_sid(); else secid = cred_sid(cred); - return avc_has_perm(&selinux_state, - secid, task_sid_obj(p), SECCLASS_PROCESS, perm, NULL); + return avc_has_perm(secid, task_sid_obj(p), SECCLASS_PROCESS, perm, NULL); } static void selinux_task_to_inode(struct task_struct *p, @@ -4245,8 +4161,8 @@ static int selinux_userns_create(const struct cred *cred) { u32 sid = current_sid(); - return avc_has_perm(&selinux_state, sid, sid, SECCLASS_USER_NAMESPACE, - USER_NAMESPACE__CREATE, NULL); + return avc_has_perm(sid, sid, SECCLASS_USER_NAMESPACE, + USER_NAMESPACE__CREATE, NULL); } /* Returns error only if unable to parse addresses */ @@ -4504,7 +4420,7 @@ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) if (unlikely(err)) return -EACCES; - err = security_net_peersid_resolve(&selinux_state, nlbl_sid, + err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid); if (unlikely(err)) { pr_warn( @@ -4533,7 +4449,7 @@ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) int err = 0; if (skb_sid != SECSID_NULL) - err = security_sid_mls_copy(&selinux_state, sk_sid, skb_sid, + err = security_sid_mls_copy(sk_sid, skb_sid, conn_sid); else *conn_sid = sk_sid; @@ -4551,7 +4467,7 @@ static int socket_sockcreate_sid(const struct task_security_struct *tsec, return 0; } - return security_transition_sid(&selinux_state, tsec->sid, tsec->sid, + return security_transition_sid(tsec->sid, tsec->sid, secclass, NULL, socksid); } @@ -4568,8 +4484,7 @@ static int sock_has_perm(struct sock *sk, u32 perms) ad.u.net = &net; ad.u.net->sk = sk; - return avc_has_perm(&selinux_state, - current_sid(), sksec->sid, sksec->sclass, perms, + return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms, &ad); } @@ -4589,8 +4504,7 @@ static int selinux_socket_create(int family, int type, if (rc) return rc; - return avc_has_perm(&selinux_state, - tsec->sid, newsid, secclass, SOCKET__CREATE, NULL); + return avc_has_perm(tsec->sid, newsid, secclass, SOCKET__CREATE, NULL); } static int selinux_socket_post_create(struct socket *sock, int family, @@ -4719,8 +4633,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in snum, &sid); if (err) goto out; - err = avc_has_perm(&selinux_state, - sksec->sid, sid, + err = avc_has_perm(sksec->sid, sid, sksec->sclass, SOCKET__NAME_BIND, &ad); if (err) @@ -4759,8 +4672,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in else ad.u.net->v6info.saddr = addr6->sin6_addr; - err = avc_has_perm(&selinux_state, - sksec->sid, sid, + err = avc_has_perm(sksec->sid, sid, sksec->sclass, node_perm, &ad); if (err) goto out; @@ -4858,8 +4770,7 @@ static int selinux_socket_connect_helper(struct socket *sock, ad.u.net = &net; ad.u.net->dport = htons(snum); ad.u.net->family = address->sa_family; - err = avc_has_perm(&selinux_state, - sksec->sid, sid, sksec->sclass, perm, &ad); + err = avc_has_perm(sksec->sid, sid, sksec->sclass, perm, &ad); if (err) return err; } @@ -4971,8 +4882,7 @@ static int selinux_socket_unix_stream_connect(struct sock *sock, ad.u.net = &net; ad.u.net->sk = other; - err = avc_has_perm(&selinux_state, - sksec_sock->sid, sksec_other->sid, + err = avc_has_perm(sksec_sock->sid, sksec_other->sid, sksec_other->sclass, UNIX_STREAM_SOCKET__CONNECTTO, &ad); if (err) @@ -4980,7 +4890,7 @@ static int selinux_socket_unix_stream_connect(struct sock *sock, /* server child socket */ sksec_new->peer_sid = sksec_sock->sid; - err = security_sid_mls_copy(&selinux_state, sksec_other->sid, + err = security_sid_mls_copy(sksec_other->sid, sksec_sock->sid, &sksec_new->sid); if (err) return err; @@ -5003,8 +4913,7 @@ static int selinux_socket_unix_may_send(struct socket *sock, ad.u.net = &net; ad.u.net->sk = other->sk; - return avc_has_perm(&selinux_state, - ssec->sid, osec->sid, osec->sclass, SOCKET__SENDTO, + return avc_has_perm(ssec->sid, osec->sid, osec->sclass, SOCKET__SENDTO, &ad); } @@ -5019,8 +4928,7 @@ static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex, err = sel_netif_sid(ns, ifindex, &if_sid); if (err) return err; - err = avc_has_perm(&selinux_state, - peer_sid, if_sid, + err = avc_has_perm(peer_sid, if_sid, SECCLASS_NETIF, NETIF__INGRESS, ad); if (err) return err; @@ -5028,8 +4936,7 @@ static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex, err = sel_netnode_sid(addrp, family, &node_sid); if (err) return err; - return avc_has_perm(&selinux_state, - peer_sid, node_sid, + return avc_has_perm(peer_sid, node_sid, SECCLASS_NODE, NODE__RECVFROM, ad); } @@ -5052,8 +4959,7 @@ static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, return err; if (selinux_secmark_enabled()) { - err = avc_has_perm(&selinux_state, - sk_sid, skb->secmark, SECCLASS_PACKET, + err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; @@ -5118,8 +5024,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) selinux_netlbl_err(skb, family, err, 0); return err; } - err = avc_has_perm(&selinux_state, - sk_sid, peer_sid, SECCLASS_PEER, + err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, &ad); if (err) { selinux_netlbl_err(skb, family, err, 0); @@ -5128,8 +5033,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) } if (secmark_active) { - err = avc_has_perm(&selinux_state, - sk_sid, skb->secmark, SECCLASS_PACKET, + err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; @@ -5155,7 +5059,7 @@ static int selinux_socket_getpeersec_stream(struct socket *sock, if (peer_sid == SECSID_NULL) return -ENOPROTOOPT; - err = security_sid_to_context(&selinux_state, peer_sid, &scontext, + err = security_sid_to_context(peer_sid, &scontext, &scontext_len); if (err) return err; @@ -5312,8 +5216,7 @@ static int selinux_sctp_process_new_assoc(struct sctp_association *asoc, ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = asoc->base.sk; - err = avc_has_perm(&selinux_state, - sksec->peer_sid, asoc->peer_secid, + err = avc_has_perm(sksec->peer_sid, asoc->peer_secid, sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad); if (err) @@ -5534,8 +5437,7 @@ static int selinux_secmark_relabel_packet(u32 sid) __tsec = selinux_cred(current_cred()); tsid = __tsec->sid; - return avc_has_perm(&selinux_state, - tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, + return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); } @@ -5584,8 +5486,7 @@ static int selinux_tun_dev_create(void) * connections unlike traditional sockets - check the TUN driver to * get a better understanding of why this socket is special */ - return avc_has_perm(&selinux_state, - sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__CREATE, + return avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__CREATE, NULL); } @@ -5593,8 +5494,7 @@ static int selinux_tun_dev_attach_queue(void *security) { struct tun_security_struct *tunsec = security; - return avc_has_perm(&selinux_state, - current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET, + return avc_has_perm(current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__ATTACH_QUEUE, NULL); } @@ -5622,13 +5522,11 @@ static int selinux_tun_dev_open(void *security) u32 sid = current_sid(); int err; - err = avc_has_perm(&selinux_state, - sid, tunsec->sid, SECCLASS_TUN_SOCKET, + err = avc_has_perm(sid, tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELFROM, NULL); if (err) return err; - err = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_TUN_SOCKET, + err = avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELTO, NULL); if (err) return err; @@ -5682,8 +5580,7 @@ static unsigned int selinux_ip_forward(void *priv, struct sk_buff *skb, } if (secmark_active) - if (avc_has_perm(&selinux_state, - peer_sid, skb->secmark, + if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) return NF_DROP; @@ -5763,8 +5660,7 @@ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, return NF_DROP; if (selinux_secmark_enabled()) - if (avc_has_perm(&selinux_state, - sksec->sid, skb->secmark, + if (avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, PACKET__SEND, &ad)) return NF_DROP_ERR(-ECONNREFUSED); @@ -5889,8 +5785,7 @@ static unsigned int selinux_ip_postroute(void *priv, return NF_DROP; if (secmark_active) - if (avc_has_perm(&selinux_state, - peer_sid, skb->secmark, + if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, secmark_perm, &ad)) return NF_DROP_ERR(-ECONNREFUSED); @@ -5900,15 +5795,13 @@ static unsigned int selinux_ip_postroute(void *priv, if (sel_netif_sid(state->net, ifindex, &if_sid)) return NF_DROP; - if (avc_has_perm(&selinux_state, - peer_sid, if_sid, + if (avc_has_perm(peer_sid, if_sid, SECCLASS_NETIF, NETIF__EGRESS, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (sel_netnode_sid(addrp, family, &node_sid)) return NF_DROP; - if (avc_has_perm(&selinux_state, - peer_sid, node_sid, + if (avc_has_perm(peer_sid, node_sid, SECCLASS_NODE, NODE__SENDTO, &ad)) return NF_DROP_ERR(-ECONNREFUSED); } @@ -5953,8 +5846,8 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) sk->sk_protocol, nlh->nlmsg_type, secclass_map[sclass - 1].name, task_pid_nr(current), current->comm); - if (enforcing_enabled(&selinux_state) && - !security_get_allow_unknown(&selinux_state)) + if (enforcing_enabled() && + !security_get_allow_unknown()) return rc; rc = 0; } else if (rc == -ENOENT) { @@ -5993,8 +5886,7 @@ static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = ipc_perms->key; - return avc_has_perm(&selinux_state, - sid, isec->sid, isec->sclass, perms, &ad); + return avc_has_perm(sid, isec->sid, isec->sclass, perms, &ad); } static int selinux_msg_msg_alloc_security(struct msg_msg *msg) @@ -6020,8 +5912,7 @@ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq) ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; - return avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_MSGQ, + return avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__CREATE, &ad); } @@ -6036,8 +5927,7 @@ static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; - return avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_MSGQ, + return avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__ASSOCIATE, &ad); } @@ -6050,8 +5940,7 @@ static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) case IPC_INFO: case MSG_INFO: /* No specific object, just general system-wide information. */ - return avc_has_perm(&selinux_state, - current_sid(), SECINITSID_KERNEL, + return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case MSG_STAT: @@ -6091,7 +5980,7 @@ static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *m * Compute new sid based on current process and * message queue this message will be stored in */ - rc = security_transition_sid(&selinux_state, sid, isec->sid, + rc = security_transition_sid(sid, isec->sid, SECCLASS_MSG, NULL, &msec->sid); if (rc) return rc; @@ -6101,18 +5990,15 @@ static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *m ad.u.ipc_id = msq->key; /* Can this process write to the queue? */ - rc = avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_MSGQ, + rc = avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__WRITE, &ad); if (!rc) /* Can this process send the message */ - rc = avc_has_perm(&selinux_state, - sid, msec->sid, SECCLASS_MSG, + rc = avc_has_perm(sid, msec->sid, SECCLASS_MSG, MSG__SEND, &ad); if (!rc) /* Can the message be put in the queue? */ - rc = avc_has_perm(&selinux_state, - msec->sid, isec->sid, SECCLASS_MSGQ, + rc = avc_has_perm(msec->sid, isec->sid, SECCLASS_MSGQ, MSGQ__ENQUEUE, &ad); return rc; @@ -6134,12 +6020,10 @@ static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *m ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; - rc = avc_has_perm(&selinux_state, - sid, isec->sid, + rc = avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__READ, &ad); if (!rc) - rc = avc_has_perm(&selinux_state, - sid, msec->sid, + rc = avc_has_perm(sid, msec->sid, SECCLASS_MSG, MSG__RECEIVE, &ad); return rc; } @@ -6157,8 +6041,7 @@ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp) ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; - return avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_SHM, + return avc_has_perm(sid, isec->sid, SECCLASS_SHM, SHM__CREATE, &ad); } @@ -6173,8 +6056,7 @@ static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg) ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; - return avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_SHM, + return avc_has_perm(sid, isec->sid, SECCLASS_SHM, SHM__ASSOCIATE, &ad); } @@ -6188,8 +6070,7 @@ static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd) case IPC_INFO: case SHM_INFO: /* No specific object, just general system-wide information. */ - return avc_has_perm(&selinux_state, - current_sid(), SECINITSID_KERNEL, + return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case SHM_STAT: @@ -6240,8 +6121,7 @@ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma) ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; - return avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_SEM, + return avc_has_perm(sid, isec->sid, SECCLASS_SEM, SEM__CREATE, &ad); } @@ -6256,8 +6136,7 @@ static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg) ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; - return avc_has_perm(&selinux_state, - sid, isec->sid, SECCLASS_SEM, + return avc_has_perm(sid, isec->sid, SECCLASS_SEM, SEM__ASSOCIATE, &ad); } @@ -6271,8 +6150,7 @@ static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd) case IPC_INFO: case SEM_INFO: /* No specific object, just general system-wide information. */ - return avc_has_perm(&selinux_state, - current_sid(), SECINITSID_KERNEL, + return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case GETPID: case GETNCNT: @@ -6359,8 +6237,7 @@ static int selinux_getprocattr(struct task_struct *p, __tsec = selinux_cred(__task_cred(p)); if (current != p) { - error = avc_has_perm(&selinux_state, - current_sid(), __tsec->sid, + error = avc_has_perm(current_sid(), __tsec->sid, SECCLASS_PROCESS, PROCESS__GETATTR, NULL); if (error) goto bad; @@ -6387,7 +6264,7 @@ static int selinux_getprocattr(struct task_struct *p, if (!sid) return 0; - error = security_sid_to_context(&selinux_state, sid, value, &len); + error = security_sid_to_context(sid, value, &len); if (error) return error; return len; @@ -6409,24 +6286,19 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) * Basic control over ability to set these attributes at all. */ if (!strcmp(name, "exec")) - error = avc_has_perm(&selinux_state, - mysid, mysid, SECCLASS_PROCESS, + error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETEXEC, NULL); else if (!strcmp(name, "fscreate")) - error = avc_has_perm(&selinux_state, - mysid, mysid, SECCLASS_PROCESS, + error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETFSCREATE, NULL); else if (!strcmp(name, "keycreate")) - error = avc_has_perm(&selinux_state, - mysid, mysid, SECCLASS_PROCESS, + error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETKEYCREATE, NULL); else if (!strcmp(name, "sockcreate")) - error = avc_has_perm(&selinux_state, - mysid, mysid, SECCLASS_PROCESS, + error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETSOCKCREATE, NULL); else if (!strcmp(name, "current")) - error = avc_has_perm(&selinux_state, - mysid, mysid, SECCLASS_PROCESS, + error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETCURRENT, NULL); else error = -EINVAL; @@ -6439,7 +6311,7 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) str[size-1] = 0; size--; } - error = security_context_to_sid(&selinux_state, value, size, + error = security_context_to_sid(value, size, &sid, GFP_KERNEL); if (error == -EINVAL && !strcmp(name, "fscreate")) { if (!has_cap_mac_admin(true)) { @@ -6463,9 +6335,8 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) return error; } - error = security_context_to_sid_force( - &selinux_state, - value, size, &sid); + error = security_context_to_sid_force(value, size, + &sid); } if (error) return error; @@ -6488,7 +6359,7 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) tsec->create_sid = sid; } else if (!strcmp(name, "keycreate")) { if (sid) { - error = avc_has_perm(&selinux_state, mysid, sid, + error = avc_has_perm(mysid, sid, SECCLASS_KEY, KEY__CREATE, NULL); if (error) goto abort_change; @@ -6503,15 +6374,13 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) /* Only allow single threaded processes to change context */ if (!current_is_single_threaded()) { - error = security_bounded_transition(&selinux_state, - tsec->sid, sid); + error = security_bounded_transition(tsec->sid, sid); if (error) goto abort_change; } /* Check permissions for the transition. */ - error = avc_has_perm(&selinux_state, - tsec->sid, sid, SECCLASS_PROCESS, + error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) goto abort_change; @@ -6520,8 +6389,7 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) Otherwise, leave SID unchanged and fail. */ ptsid = ptrace_parent_sid(); if (ptsid != 0) { - error = avc_has_perm(&selinux_state, - ptsid, sid, SECCLASS_PROCESS, + error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (error) goto abort_change; @@ -6548,13 +6416,13 @@ static int selinux_ismaclabel(const char *name) static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { - return security_sid_to_context(&selinux_state, secid, + return security_sid_to_context(secid, secdata, seclen); } static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { - return security_context_to_sid(&selinux_state, secdata, seclen, + return security_context_to_sid(secdata, seclen, secid, GFP_KERNEL); } @@ -6674,8 +6542,7 @@ static int selinux_key_permission(key_ref_t key_ref, key = key_ref_to_ptr(key_ref); ksec = key->security; - return avc_has_perm(&selinux_state, - sid, ksec->sid, SECCLASS_KEY, perm, NULL); + return avc_has_perm(sid, ksec->sid, SECCLASS_KEY, perm, NULL); } static int selinux_key_getsecurity(struct key *key, char **_buffer) @@ -6685,7 +6552,7 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer) unsigned len; int rc; - rc = security_sid_to_context(&selinux_state, ksec->sid, + rc = security_sid_to_context(ksec->sid, &context, &len); if (!rc) rc = len; @@ -6699,8 +6566,7 @@ static int selinux_watch_key(struct key *key) struct key_security_struct *ksec = key->security; u32 sid = current_sid(); - return avc_has_perm(&selinux_state, - sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL); + return avc_has_perm(sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL); } #endif #endif @@ -6722,8 +6588,7 @@ static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val) ibpkey.subnet_prefix = subnet_prefix; ibpkey.pkey = pkey_val; ad.u.ibpkey = &ibpkey; - return avc_has_perm(&selinux_state, - sec->sid, sid, + return avc_has_perm(sec->sid, sid, SECCLASS_INFINIBAND_PKEY, INFINIBAND_PKEY__ACCESS, &ad); } @@ -6737,7 +6602,7 @@ static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name, struct ib_security_struct *sec = ib_sec; struct lsm_ibendport_audit ibendport; - err = security_ib_endport_sid(&selinux_state, dev_name, port_num, + err = security_ib_endport_sid(dev_name, port_num, &sid); if (err) @@ -6747,8 +6612,7 @@ static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name, ibendport.dev_name = dev_name; ibendport.port = port_num; ad.u.ibendport = &ibendport; - return avc_has_perm(&selinux_state, - sec->sid, sid, + return avc_has_perm(sec->sid, sid, SECCLASS_INFINIBAND_ENDPORT, INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad); } @@ -6781,13 +6645,11 @@ static int selinux_bpf(int cmd, union bpf_attr *attr, switch (cmd) { case BPF_MAP_CREATE: - ret = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_BPF, BPF__MAP_CREATE, + ret = avc_has_perm(sid, sid, SECCLASS_BPF, BPF__MAP_CREATE, NULL); break; case BPF_PROG_LOAD: - ret = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_BPF, BPF__PROG_LOAD, + ret = avc_has_perm(sid, sid, SECCLASS_BPF, BPF__PROG_LOAD, NULL); break; default: @@ -6827,16 +6689,14 @@ static int bpf_fd_pass(struct file *file, u32 sid) if (file->f_op == &bpf_map_fops) { map = file->private_data; bpfsec = map->security; - ret = avc_has_perm(&selinux_state, - sid, bpfsec->sid, SECCLASS_BPF, + ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; bpfsec = prog->aux->security; - ret = avc_has_perm(&selinux_state, - sid, bpfsec->sid, SECCLASS_BPF, + ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) return ret; @@ -6850,8 +6710,7 @@ static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) struct bpf_security_struct *bpfsec; bpfsec = map->security; - return avc_has_perm(&selinux_state, - sid, bpfsec->sid, SECCLASS_BPF, + return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } @@ -6861,8 +6720,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) struct bpf_security_struct *bpfsec; bpfsec = prog->aux->security; - return avc_has_perm(&selinux_state, - sid, bpfsec->sid, SECCLASS_BPF, + return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } @@ -6911,7 +6769,7 @@ static void selinux_bpf_prog_free(struct bpf_prog_aux *aux) } #endif -struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = { +struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct task_security_struct), .lbs_file = sizeof(struct file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), @@ -6936,7 +6794,7 @@ static int selinux_perf_event_open(struct perf_event_attr *attr, int type) else return -EINVAL; - return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT, + return avc_has_perm(sid, sid, SECCLASS_PERF_EVENT, requested, NULL); } @@ -6967,7 +6825,7 @@ static int selinux_perf_event_read(struct perf_event *event) struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); - return avc_has_perm(&selinux_state, sid, perfsec->sid, + return avc_has_perm(sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL); } @@ -6976,7 +6834,7 @@ static int selinux_perf_event_write(struct perf_event *event) struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); - return avc_has_perm(&selinux_state, sid, perfsec->sid, + return avc_has_perm(sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL); } #endif @@ -6991,7 +6849,7 @@ static int selinux_perf_event_write(struct perf_event *event) */ static int selinux_uring_override_creds(const struct cred *new) { - return avc_has_perm(&selinux_state, current_sid(), cred_sid(new), + return avc_has_perm(current_sid(), cred_sid(new), SECCLASS_IO_URING, IO_URING__OVERRIDE_CREDS, NULL); } @@ -7005,7 +6863,7 @@ static int selinux_uring_sqpoll(void) { int sid = current_sid(); - return avc_has_perm(&selinux_state, sid, sid, + return avc_has_perm(sid, sid, SECCLASS_IO_URING, IO_URING__SQPOLL, NULL); } @@ -7027,7 +6885,7 @@ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd) ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; - return avc_has_perm(&selinux_state, current_sid(), isec->sid, + return avc_has_perm(current_sid(), isec->sid, SECCLASS_IO_URING, IO_URING__CMD, &ad); } #endif /* CONFIG_IO_URING */ @@ -7047,7 +6905,7 @@ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd) * safely. Breaking the ordering rules above might lead to NULL pointer derefs * when disabling SELinux at runtime. */ -static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { +static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr), LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction), LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder), @@ -7334,11 +7192,8 @@ static __init int selinux_init(void) pr_info("SELinux: Initializing.\n"); memset(&selinux_state, 0, sizeof(selinux_state)); - enforcing_set(&selinux_state, selinux_enforcing_boot); - if (CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE) - pr_err("SELinux: CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE is non-zero. This is deprecated and will be rejected in a future kernel release.\n"); - checkreqprot_set(&selinux_state, selinux_checkreqprot_boot); - selinux_avc_init(&selinux_state.avc); + enforcing_set(selinux_enforcing_boot); + selinux_avc_init(); mutex_init(&selinux_state.status_lock); mutex_init(&selinux_state.policy_mutex); @@ -7398,7 +7253,6 @@ DEFINE_LSM(selinux) = { }; #if defined(CONFIG_NETFILTER) - static const struct nf_hook_ops selinux_nf_ops[] = { { .hook = selinux_ip_postroute, @@ -7473,56 +7327,4 @@ static int __init selinux_nf_ip_init(void) return 0; } __initcall(selinux_nf_ip_init); - -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -static void selinux_nf_ip_exit(void) -{ - pr_debug("SELinux: Unregistering netfilter hooks\n"); - - unregister_pernet_subsys(&selinux_net_ops); -} -#endif - -#else /* CONFIG_NETFILTER */ - -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -#define selinux_nf_ip_exit() -#endif - #endif /* CONFIG_NETFILTER */ - -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -int selinux_disable(struct selinux_state *state) -{ - if (selinux_initialized(state)) { - /* Not permitted after initial policy load. */ - return -EINVAL; - } - - if (selinux_disabled(state)) { - /* Only do this once. */ - return -EINVAL; - } - - selinux_mark_disabled(state); - - pr_info("SELinux: Disabled at runtime.\n"); - - /* - * Unregister netfilter hooks. - * Must be done before security_delete_hooks() to avoid breaking - * runtime disable. - */ - selinux_nf_ip_exit(); - - security_delete_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks)); - - /* Try to destroy the avc node cache */ - avc_disable(); - - /* Unregister selinuxfs. */ - exit_sel_fs(); - - return 0; -} -#endif diff --git a/security/selinux/ibpkey.c b/security/selinux/ibpkey.c index 5839ca7bb9c7..48f537b41c58 100644 --- a/security/selinux/ibpkey.c +++ b/security/selinux/ibpkey.c @@ -141,7 +141,7 @@ static int sel_ib_pkey_sid_slow(u64 subnet_prefix, u16 pkey_num, u32 *sid) return 0; } - ret = security_ib_pkey_sid(&selinux_state, subnet_prefix, pkey_num, + ret = security_ib_pkey_sid(subnet_prefix, pkey_num, sid); if (ret) goto out; diff --git a/security/selinux/ima.c b/security/selinux/ima.c index a915b89d55b0..7daf59667f59 100644 --- a/security/selinux/ima.c +++ b/security/selinux/ima.c @@ -15,12 +15,10 @@ /* * selinux_ima_collect_state - Read selinux configuration settings * - * @state: selinux_state - * * On success returns the configuration settings string. * On error, returns NULL. */ -static char *selinux_ima_collect_state(struct selinux_state *state) +static char *selinux_ima_collect_state(void) { const char *on = "=1;", *off = "=0;"; char *buf; @@ -39,26 +37,27 @@ static char *selinux_ima_collect_state(struct selinux_state *state) rc = strscpy(buf, "initialized", buf_len); WARN_ON(rc < 0); - rc = strlcat(buf, selinux_initialized(state) ? on : off, buf_len); + rc = strlcat(buf, selinux_initialized() ? on : off, buf_len); WARN_ON(rc >= buf_len); rc = strlcat(buf, "enforcing", buf_len); WARN_ON(rc >= buf_len); - rc = strlcat(buf, enforcing_enabled(state) ? on : off, buf_len); + rc = strlcat(buf, enforcing_enabled() ? on : off, buf_len); WARN_ON(rc >= buf_len); rc = strlcat(buf, "checkreqprot", buf_len); WARN_ON(rc >= buf_len); - rc = strlcat(buf, checkreqprot_get(state) ? on : off, buf_len); + rc = strlcat(buf, checkreqprot_get() ? on : off, buf_len); WARN_ON(rc >= buf_len); for (i = 0; i < __POLICYDB_CAP_MAX; i++) { rc = strlcat(buf, selinux_policycap_names[i], buf_len); WARN_ON(rc >= buf_len); - rc = strlcat(buf, state->policycap[i] ? on : off, buf_len); + rc = strlcat(buf, selinux_state.policycap[i] ? on : off, + buf_len); WARN_ON(rc >= buf_len); } @@ -67,19 +66,17 @@ static char *selinux_ima_collect_state(struct selinux_state *state) /* * selinux_ima_measure_state_locked - Measure SELinux state and hash of policy - * - * @state: selinux state struct */ -void selinux_ima_measure_state_locked(struct selinux_state *state) +void selinux_ima_measure_state_locked(void) { char *state_str = NULL; void *policy = NULL; size_t policy_len; int rc = 0; - lockdep_assert_held(&state->policy_mutex); + lockdep_assert_held(&selinux_state.policy_mutex); - state_str = selinux_ima_collect_state(state); + state_str = selinux_ima_collect_state(); if (!state_str) { pr_err("SELinux: %s: failed to read state.\n", __func__); return; @@ -94,10 +91,10 @@ void selinux_ima_measure_state_locked(struct selinux_state *state) /* * Measure SELinux policy only after initialization is completed. */ - if (!selinux_initialized(state)) + if (!selinux_initialized()) return; - rc = security_read_state_kernel(state, &policy, &policy_len); + rc = security_read_state_kernel(&policy, &policy_len); if (rc) { pr_err("SELinux: %s: failed to read policy %d.\n", __func__, rc); return; @@ -112,14 +109,12 @@ void selinux_ima_measure_state_locked(struct selinux_state *state) /* * selinux_ima_measure_state - Measure SELinux state and hash of policy - * - * @state: selinux state struct */ -void selinux_ima_measure_state(struct selinux_state *state) +void selinux_ima_measure_state(void) { - lockdep_assert_not_held(&state->policy_mutex); + lockdep_assert_not_held(&selinux_state.policy_mutex); - mutex_lock(&state->policy_mutex); - selinux_ima_measure_state_locked(state); - mutex_unlock(&state->policy_mutex); + mutex_lock(&selinux_state.policy_mutex); + selinux_ima_measure_state_locked(); + mutex_unlock(&selinux_state.policy_mutex); } diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 5525b94fd266..9301222c8e55 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -52,7 +52,6 @@ struct selinux_audit_data { u32 audited; u32 denied; int result; - struct selinux_state *state; } __randomize_layout; /* @@ -97,14 +96,12 @@ static inline u32 avc_audit_required(u32 requested, return audited; } -int slow_avc_audit(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, u32 audited, u32 denied, int result, struct common_audit_data *a); /** * avc_audit - Audit the granting or denial of permissions. - * @state: SELinux state * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -122,8 +119,7 @@ int slow_avc_audit(struct selinux_state *state, * be performed under a lock, to allow the lock to be released * before calling the auditing code. */ -static inline int avc_audit(struct selinux_state *state, - u32 ssid, u32 tsid, +static inline int avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd, int result, @@ -133,30 +129,27 @@ static inline int avc_audit(struct selinux_state *state, audited = avc_audit_required(requested, avd, result, 0, &denied); if (likely(!audited)) return 0; - return slow_avc_audit(state, ssid, tsid, tclass, + return slow_avc_audit(ssid, tsid, tclass, requested, audited, denied, result, a); } #define AVC_STRICT 1 /* Ignore permissive mode. */ #define AVC_EXTENDED_PERMS 2 /* update extended permissions */ -int avc_has_perm_noaudit(struct selinux_state *state, - u32 ssid, u32 tsid, +int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, unsigned flags, struct av_decision *avd); -int avc_has_perm(struct selinux_state *state, - u32 ssid, u32 tsid, +int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata); -int avc_has_extended_perms(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, u32 requested, +int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, u8 driver, u8 perm, struct common_audit_data *ad); -u32 avc_policy_seqno(struct selinux_state *state); +u32 avc_policy_seqno(void); #define AVC_CALLBACK_GRANT 1 #define AVC_CALLBACK_TRY_REVOKE 2 @@ -171,11 +164,9 @@ u32 avc_policy_seqno(struct selinux_state *state); int avc_add_callback(int (*callback)(u32 event), u32 events); /* Exported to selinuxfs */ -struct selinux_avc; -int avc_get_hash_stats(struct selinux_avc *avc, char *page); -unsigned int avc_get_cache_threshold(struct selinux_avc *avc); -void avc_set_cache_threshold(struct selinux_avc *avc, - unsigned int cache_threshold); +int avc_get_hash_stats(char *page); +unsigned int avc_get_cache_threshold(void); +void avc_set_cache_threshold(unsigned int cache_threshold); /* Attempt to free avc node cache */ void avc_disable(void); diff --git a/security/selinux/include/avc_ss.h b/security/selinux/include/avc_ss.h index 42912c917fd4..b9668be7b443 100644 --- a/security/selinux/include/avc_ss.h +++ b/security/selinux/include/avc_ss.h @@ -9,8 +9,7 @@ #include <linux/types.h> -struct selinux_avc; -int avc_ss_reset(struct selinux_avc *avc, u32 seqno); +int avc_ss_reset(u32 seqno); /* Class/perm mapping support */ struct security_class_mapping { diff --git a/security/selinux/include/conditional.h b/security/selinux/include/conditional.h index b09343346e3f..693a654714eb 100644 --- a/security/selinux/include/conditional.h +++ b/security/selinux/include/conditional.h @@ -16,8 +16,8 @@ int security_get_bools(struct selinux_policy *policy, u32 *len, char ***names, int **values); -int security_set_bools(struct selinux_state *state, u32 len, int *values); +int security_set_bools(u32 len, int *values); -int security_get_bool_value(struct selinux_state *state, u32 index); +int security_get_bool_value(u32 index); #endif diff --git a/security/selinux/include/ima.h b/security/selinux/include/ima.h index 75ca92b4a462..05e04172c86d 100644 --- a/security/selinux/include/ima.h +++ b/security/selinux/include/ima.h @@ -14,15 +14,13 @@ #include "security.h" #ifdef CONFIG_IMA -extern void selinux_ima_measure_state(struct selinux_state *selinux_state); -extern void selinux_ima_measure_state_locked( - struct selinux_state *selinux_state); +extern void selinux_ima_measure_state(void); +extern void selinux_ima_measure_state_locked(void); #else -static inline void selinux_ima_measure_state(struct selinux_state *selinux_state) +static inline void selinux_ima_measure_state(void) { } -static inline void selinux_ima_measure_state_locked( - struct selinux_state *selinux_state) +static inline void selinux_ima_measure_state_locked(void) { } #endif diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 393aff41d3ef..8746fafeb778 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -86,94 +86,65 @@ extern int selinux_enabled_boot; /* limitation of boundary depth */ #define POLICYDB_BOUNDS_MAXDEPTH 4 -struct selinux_avc; struct selinux_policy; struct selinux_state { -#ifdef CONFIG_SECURITY_SELINUX_DISABLE - bool disabled; -#endif #ifdef CONFIG_SECURITY_SELINUX_DEVELOP bool enforcing; #endif - bool checkreqprot; bool initialized; bool policycap[__POLICYDB_CAP_MAX]; struct page *status_page; struct mutex status_lock; - struct selinux_avc *avc; struct selinux_policy __rcu *policy; struct mutex policy_mutex; } __randomize_layout; -void selinux_avc_init(struct selinux_avc **avc); +void selinux_avc_init(void); extern struct selinux_state selinux_state; -static inline bool selinux_initialized(const struct selinux_state *state) +static inline bool selinux_initialized(void) { /* do a synchronized load to avoid race conditions */ - return smp_load_acquire(&state->initialized); + return smp_load_acquire(&selinux_state.initialized); } -static inline void selinux_mark_initialized(struct selinux_state *state) +static inline void selinux_mark_initialized(void) { /* do a synchronized write to avoid race conditions */ - smp_store_release(&state->initialized, true); + smp_store_release(&selinux_state.initialized, true); } #ifdef CONFIG_SECURITY_SELINUX_DEVELOP -static inline bool enforcing_enabled(struct selinux_state *state) +static inline bool enforcing_enabled(void) { - return READ_ONCE(state->enforcing); + return READ_ONCE(selinux_state.enforcing); } -static inline void enforcing_set(struct selinux_state *state, bool value) +static inline void enforcing_set(bool value) { - WRITE_ONCE(state->enforcing, value); + WRITE_ONCE(selinux_state.enforcing, value); } #else -static inline bool enforcing_enabled(struct selinux_state *state) +static inline bool enforcing_enabled(void) { return true; } -static inline void enforcing_set(struct selinux_state *state, bool value) +static inline void enforcing_set(bool value) { } #endif -static inline bool checkreqprot_get(const struct selinux_state *state) -{ - return READ_ONCE(state->checkreqprot); -} - -static inline void checkreqprot_set(struct selinux_state *state, bool value) +static inline bool checkreqprot_get(void) { - if (value) - pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-checkreqprot\n"); - WRITE_ONCE(state->checkreqprot, value); + /* non-zero/true checkreqprot values are no longer supported */ + return 0; } -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -static inline bool selinux_disabled(struct selinux_state *state) -{ - return READ_ONCE(state->disabled); -} - -static inline void selinux_mark_disabled(struct selinux_state *state) -{ - WRITE_ONCE(state->disabled, true); -} -#else -static inline bool selinux_disabled(struct selinux_state *state) -{ - return false; -} -#endif - static inline bool selinux_policycap_netpeer(void) { struct selinux_state *state = &selinux_state; @@ -237,20 +208,14 @@ struct selinux_load_state { struct selinux_policy_convert_data *convert_data; }; -int security_mls_enabled(struct selinux_state *state); -int security_load_policy(struct selinux_state *state, - void *data, size_t len, +int security_mls_enabled(void); +int security_load_policy(void *data, size_t len, struct selinux_load_state *load_state); -void selinux_policy_commit(struct selinux_state *state, - struct selinux_load_state *load_state); -void selinux_policy_cancel(struct selinux_state *state, - struct selinux_load_state *load_state); -int security_read_policy(struct selinux_state *state, - void **data, size_t *len); -int security_read_state_kernel(struct selinux_state *state, - void **data, size_t *len); -int security_policycap_supported(struct selinux_state *state, - unsigned int req_cap); +void selinux_policy_commit(struct selinux_load_state *load_state); +void selinux_policy_cancel(struct selinux_load_state *load_state); +int security_read_policy(void **data, size_t *len); +int security_read_state_kernel(void **data, size_t *len); +int security_policycap_supported(unsigned int req_cap); #define SEL_VEC_MAX 32 struct av_decision { @@ -287,94 +252,68 @@ struct extended_perms { /* definitions of av_decision.flags */ #define AVD_FLAGS_PERMISSIVE 0x0001 -void security_compute_av(struct selinux_state *state, - u32 ssid, u32 tsid, +void security_compute_av(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd, struct extended_perms *xperms); -void security_compute_xperms_decision(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, u8 driver, struct extended_perms_decision *xpermd); -void security_compute_av_user(struct selinux_state *state, - u32 ssid, u32 tsid, +void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd); -int security_transition_sid(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +int security_transition_sid(u32 ssid, u32 tsid, u16 tclass, const struct qstr *qstr, u32 *out_sid); -int security_transition_sid_user(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +int security_transition_sid_user(u32 ssid, u32 tsid, u16 tclass, const char *objname, u32 *out_sid); -int security_member_sid(struct selinux_state *state, u32 ssid, u32 tsid, - u16 tclass, u32 *out_sid); +int security_member_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid); -int security_change_sid(struct selinux_state *state, u32 ssid, u32 tsid, - u16 tclass, u32 *out_sid); +int security_change_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid); -int security_sid_to_context(struct selinux_state *state, u32 sid, - char **scontext, u32 *scontext_len); +int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len); -int security_sid_to_context_force(struct selinux_state *state, - u32 sid, char **scontext, u32 *scontext_len); +int security_sid_to_context_force(u32 sid, char **scontext, u32 *scontext_len); -int security_sid_to_context_inval(struct selinux_state *state, - u32 sid, char **scontext, u32 *scontext_len); +int security_sid_to_context_inval(u32 sid, char **scontext, u32 *scontext_len); -int security_context_to_sid(struct selinux_state *state, - const char *scontext, u32 scontext_len, +int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *out_sid, gfp_t gfp); -int security_context_str_to_sid(struct selinux_state *state, - const char *scontext, u32 *out_sid, gfp_t gfp); +int security_context_str_to_sid(const char *scontext, u32 *out_sid, gfp_t gfp); -int security_context_to_sid_default(struct selinux_state *state, - const char *scontext, u32 scontext_len, +int security_context_to_sid_default(const char *scontext, u32 scontext_len, u32 *out_sid, u32 def_sid, gfp_t gfp_flags); -int security_context_to_sid_force(struct selinux_state *state, - const char *scontext, u32 scontext_len, +int security_context_to_sid_force(const char *scontext, u32 scontext_len, u32 *sid); -int security_get_user_sids(struct selinux_state *state, - u32 callsid, char *username, - u32 **sids, u32 *nel); +int security_get_user_sids(u32 callsid, char *username, u32 **sids, u32 *nel); -int security_port_sid(struct selinux_state *state, - u8 protocol, u16 port, u32 *out_sid); +int security_port_sid(u8 protocol, u16 port, u32 *out_sid); -int security_ib_pkey_sid(struct selinux_state *state, - u64 subnet_prefix, u16 pkey_num, u32 *out_sid); +int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid); -int security_ib_endport_sid(struct selinux_state *state, - const char *dev_name, u8 port_num, u32 *out_sid); +int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid); -int security_netif_sid(struct selinux_state *state, - char *name, u32 *if_sid); +int security_netif_sid(char *name, u32 *if_sid); -int security_node_sid(struct selinux_state *state, - u16 domain, void *addr, u32 addrlen, +int security_node_sid(u16 domain, void *addr, u32 addrlen, u32 *out_sid); -int security_validate_transition(struct selinux_state *state, - u32 oldsid, u32 newsid, u32 tasksid, +int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); -int security_validate_transition_user(struct selinux_state *state, - u32 oldsid, u32 newsid, u32 tasksid, +int security_validate_transition_user(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); -int security_bounded_transition(struct selinux_state *state, - u32 oldsid, u32 newsid); +int security_bounded_transition(u32 oldsid, u32 newsid); -int security_sid_mls_copy(struct selinux_state *state, - u32 sid, u32 mls_sid, u32 *new_sid); +int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); -int security_net_peersid_resolve(struct selinux_state *state, - u32 nlbl_sid, u32 nlbl_type, +int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, u32 xfrm_sid, u32 *peer_sid); @@ -382,8 +321,8 @@ int security_get_classes(struct selinux_policy *policy, char ***classes, int *nclasses); int security_get_permissions(struct selinux_policy *policy, char *class, char ***perms, int *nperms); -int security_get_reject_unknown(struct selinux_state *state); -int security_get_allow_unknown(struct selinux_state *state); +int security_get_reject_unknown(void); +int security_get_allow_unknown(void); #define SECURITY_FS_USE_XATTR 1 /* use xattr */ #define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */ @@ -394,10 +333,9 @@ int security_get_allow_unknown(struct selinux_state *state); #define SECURITY_FS_USE_NATIVE 7 /* use native label support */ #define SECURITY_FS_USE_MAX 7 /* Highest SECURITY_FS_USE_XXX */ -int security_fs_use(struct selinux_state *state, struct super_block *sb); +int security_fs_use(struct super_block *sb); -int security_genfs_sid(struct selinux_state *state, - const char *fstype, const char *path, u16 sclass, +int security_genfs_sid(const char *fstype, const char *path, u16 sclass, u32 *sid); int selinux_policy_genfs_sid(struct selinux_policy *policy, @@ -405,23 +343,19 @@ int selinux_policy_genfs_sid(struct selinux_policy *policy, u32 *sid); #ifdef CONFIG_NETLABEL -int security_netlbl_secattr_to_sid(struct selinux_state *state, - struct netlbl_lsm_secattr *secattr, +int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, u32 *sid); -int security_netlbl_sid_to_secattr(struct selinux_state *state, - u32 sid, +int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr); #else -static inline int security_netlbl_secattr_to_sid(struct selinux_state *state, - struct netlbl_lsm_secattr *secattr, +static inline int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, u32 *sid) { return -EIDRM; } -static inline int security_netlbl_sid_to_secattr(struct selinux_state *state, - u32 sid, +static inline int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr) { return -ENOENT; @@ -433,7 +367,7 @@ const char *security_get_initial_sid_context(u32 sid); /* * status notifier using mmap interface */ -extern struct page *selinux_kernel_status_page(struct selinux_state *state); +extern struct page *selinux_kernel_status_page(void); #define SELINUX_KERNEL_STATUS_VERSION 1 struct selinux_kernel_status { @@ -447,12 +381,9 @@ struct selinux_kernel_status { */ } __packed; -extern void selinux_status_update_setenforce(struct selinux_state *state, - int enforcing); -extern void selinux_status_update_policyload(struct selinux_state *state, - int seqno); +extern void selinux_status_update_setenforce(int enforcing); +extern void selinux_status_update_policyload(int seqno); extern void selinux_complete_init(void); -extern int selinux_disable(struct selinux_state *state); extern void exit_sel_fs(void); extern struct path selinux_null; extern void selnl_notify_setenforce(int val); @@ -462,6 +393,6 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm); extern void avtab_cache_init(void); extern void ebitmap_cache_init(void); extern void hashtab_cache_init(void); -extern int security_sidtab_hash_stats(struct selinux_state *state, char *page); +extern int security_sidtab_hash_stats(char *page); #endif /* _SELINUX_SECURITY_H_ */ diff --git a/security/selinux/netif.c b/security/selinux/netif.c index 1ab03efe7494..adbe9bea2d26 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -153,7 +153,7 @@ static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid) goto out; } - ret = security_netif_sid(&selinux_state, dev->name, sid); + ret = security_netif_sid(dev->name, sid); if (ret != 0) goto out; new = kzalloc(sizeof(*new), GFP_ATOMIC); diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 1321f15799e2..767c670d33ea 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -46,7 +46,7 @@ static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb, { int rc; - rc = security_netlbl_secattr_to_sid(&selinux_state, secattr, sid); + rc = security_netlbl_secattr_to_sid(secattr, sid); if (rc == 0 && (secattr->flags & NETLBL_SECATTR_CACHEABLE) && (secattr->flags & NETLBL_SECATTR_CACHE)) @@ -77,8 +77,7 @@ static struct netlbl_lsm_secattr *selinux_netlbl_sock_genattr(struct sock *sk) secattr = netlbl_secattr_alloc(GFP_ATOMIC); if (secattr == NULL) return NULL; - rc = security_netlbl_sid_to_secattr(&selinux_state, sksec->sid, - secattr); + rc = security_netlbl_sid_to_secattr(sksec->sid, secattr); if (rc != 0) { netlbl_secattr_free(secattr); return NULL; @@ -245,8 +244,7 @@ int selinux_netlbl_skbuff_setsid(struct sk_buff *skb, if (secattr == NULL) { secattr = &secattr_storage; netlbl_secattr_init(secattr); - rc = security_netlbl_sid_to_secattr(&selinux_state, sid, - secattr); + rc = security_netlbl_sid_to_secattr(sid, secattr); if (rc != 0) goto skbuff_setsid_return; } @@ -283,8 +281,7 @@ int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, return 0; netlbl_secattr_init(&secattr); - rc = security_netlbl_sid_to_secattr(&selinux_state, - asoc->secid, &secattr); + rc = security_netlbl_sid_to_secattr(asoc->secid, &secattr); if (rc != 0) goto assoc_request_return; @@ -332,8 +329,7 @@ int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family) return 0; netlbl_secattr_init(&secattr); - rc = security_netlbl_sid_to_secattr(&selinux_state, req->secid, - &secattr); + rc = security_netlbl_sid_to_secattr(req->secid, &secattr); if (rc != 0) goto inet_conn_request_return; rc = netlbl_req_setattr(req, &secattr); @@ -463,8 +459,7 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, perm = RAWIP_SOCKET__RECVFROM; } - rc = avc_has_perm(&selinux_state, - sksec->sid, nlbl_sid, sksec->sclass, perm, ad); + rc = avc_has_perm(sksec->sid, nlbl_sid, sksec->sclass, perm, ad); if (rc == 0) return 0; diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index 0ac7df9a9367..5c8c77e50aad 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -204,13 +204,13 @@ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) new = kzalloc(sizeof(*new), GFP_ATOMIC); switch (family) { case PF_INET: - ret = security_node_sid(&selinux_state, PF_INET, + ret = security_node_sid(PF_INET, addr, sizeof(struct in_addr), sid); if (new) new->nsec.addr.ipv4 = *(__be32 *)addr; break; case PF_INET6: - ret = security_node_sid(&selinux_state, PF_INET6, + ret = security_node_sid(PF_INET6, addr, sizeof(struct in6_addr), sid); if (new) new->nsec.addr.ipv6 = *(struct in6_addr *)addr; diff --git a/security/selinux/netport.c b/security/selinux/netport.c index 8eec6347cf01..2e22ad9c2bd0 100644 --- a/security/selinux/netport.c +++ b/security/selinux/netport.c @@ -148,7 +148,7 @@ static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid) return 0; } - ret = security_port_sid(&selinux_state, protocol, pnum, sid); + ret = security_port_sid(protocol, pnum, sid); if (ret != 0) goto out; new = kzalloc(sizeof(*new), GFP_ATOMIC); diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 18498979a640..69a583b91fc5 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -77,7 +77,6 @@ struct selinux_fs_info { bool policy_opened; struct dentry *policycap_dir; unsigned long last_ino; - struct selinux_state *state; struct super_block *sb; }; @@ -90,7 +89,6 @@ static int selinux_fs_info_create(struct super_block *sb) return -ENOMEM; fsi->last_ino = SEL_INO_NEXT - 1; - fsi->state = &selinux_state; fsi->sb = sb; sb->s_fs_info = fsi; return 0; @@ -125,12 +123,11 @@ static void selinux_fs_info_free(struct super_block *sb) static ssize_t sel_read_enforce(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%d", - enforcing_enabled(fsi->state)); + enforcing_enabled()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -139,8 +136,6 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *page = NULL; ssize_t length; int old_value, new_value; @@ -162,10 +157,9 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf, new_value = !!new_value; - old_value = enforcing_enabled(state); + old_value = enforcing_enabled(); if (new_value != old_value) { - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETENFORCE, NULL); if (length) @@ -176,15 +170,15 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf, new_value, old_value, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); - enforcing_set(state, new_value); + enforcing_set(new_value); if (new_value) - avc_ss_reset(state->avc, 0); + avc_ss_reset(0); selnl_notify_setenforce(new_value); - selinux_status_update_setenforce(state, new_value); + selinux_status_update_setenforce(new_value); if (!new_value) call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL); - selinux_ima_measure_state(state); + selinux_ima_measure_state(); } length = count; out: @@ -204,14 +198,12 @@ static const struct file_operations sel_enforce_ops = { static ssize_t sel_read_handle_unknown(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char tmpbuf[TMPBUFLEN]; ssize_t length; ino_t ino = file_inode(filp)->i_ino; int handle_unknown = (ino == SEL_REJECT_UNKNOWN) ? - security_get_reject_unknown(state) : - !security_get_allow_unknown(state); + security_get_reject_unknown() : + !security_get_allow_unknown(); length = scnprintf(tmpbuf, TMPBUFLEN, "%d", handle_unknown); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); @@ -224,8 +216,7 @@ static const struct file_operations sel_handle_unknown_ops = { static int sel_open_handle_status(struct inode *inode, struct file *filp) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; - struct page *status = selinux_kernel_status_page(fsi->state); + struct page *status = selinux_kernel_status_page(); if (!status) return -ENOMEM; @@ -276,25 +267,13 @@ static const struct file_operations sel_handle_status_ops = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_SECURITY_SELINUX_DISABLE static ssize_t sel_write_disable(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; char *page; ssize_t length; int new_value; - int enforcing; - - /* NOTE: we are now officially considering runtime disable as - * deprecated, and using it will become increasingly painful - * (e.g. sleeping/blocking) as we progress through future - * kernel releases until eventually it is removed - */ - pr_err("SELinux: Runtime disable is deprecated, use selinux=0 on the kernel cmdline.\n"); - pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n"); - ssleep(15); if (count >= PAGE_SIZE) return -ENOMEM; @@ -307,31 +286,21 @@ static ssize_t sel_write_disable(struct file *file, const char __user *buf, if (IS_ERR(page)) return PTR_ERR(page); - length = -EINVAL; - if (sscanf(page, "%d", &new_value) != 1) + if (sscanf(page, "%d", &new_value) != 1) { + length = -EINVAL; goto out; + } + length = count; if (new_value) { - enforcing = enforcing_enabled(fsi->state); - length = selinux_disable(fsi->state); - if (length) - goto out; - audit_log(audit_context(), GFP_KERNEL, AUDIT_MAC_STATUS, - "enforcing=%d old_enforcing=%d auid=%u ses=%u" - " enabled=0 old-enabled=1 lsm=selinux res=1", - enforcing, enforcing, - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); + pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n"); + pr_err("SELinux: Runtime disable is not supported, use selinux=0 on the kernel cmdline.\n"); } - length = count; out: kfree(page); return length; } -#else -#define sel_write_disable NULL -#endif static const struct file_operations sel_disable_ops = { .write = sel_write_disable, @@ -375,12 +344,11 @@ static void sel_remove_entries(struct dentry *de); static ssize_t sel_read_mls(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%d", - security_mls_enabled(fsi->state)); + security_mls_enabled()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -397,16 +365,14 @@ struct policy_load_memory { static int sel_open_policy(struct inode *inode, struct file *filp) { struct selinux_fs_info *fsi = inode->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; struct policy_load_memory *plm = NULL; int rc; BUG_ON(filp->private_data); - mutex_lock(&fsi->state->policy_mutex); + mutex_lock(&selinux_state.policy_mutex); - rc = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + rc = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__READ_POLICY, NULL); if (rc) goto err; @@ -420,7 +386,7 @@ static int sel_open_policy(struct inode *inode, struct file *filp) if (!plm) goto err; - rc = security_read_policy(state, &plm->data, &plm->len); + rc = security_read_policy(&plm->data, &plm->len); if (rc) goto err; @@ -434,11 +400,11 @@ static int sel_open_policy(struct inode *inode, struct file *filp) filp->private_data = plm; - mutex_unlock(&fsi->state->policy_mutex); + mutex_unlock(&selinux_state.policy_mutex); return 0; err: - mutex_unlock(&fsi->state->policy_mutex); + mutex_unlock(&selinux_state.policy_mutex); if (plm) vfree(plm->data); @@ -467,8 +433,7 @@ static ssize_t sel_read_policy(struct file *filp, char __user *buf, struct policy_load_memory *plm = filp->private_data; int ret; - ret = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + ret = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__READ_POLICY, NULL); if (ret) return ret; @@ -621,10 +586,9 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, ssize_t length; void *data = NULL; - mutex_lock(&fsi->state->policy_mutex); + mutex_lock(&selinux_state.policy_mutex); - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__LOAD_POLICY, NULL); if (length) goto out; @@ -643,7 +607,7 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, if (copy_from_user(data, buf, count) != 0) goto out; - length = security_load_policy(fsi->state, data, count, &load_state); + length = security_load_policy(data, count, &load_state); if (length) { pr_warn_ratelimited("SELinux: failed to load policy\n"); goto out; @@ -652,11 +616,11 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, length = sel_make_policy_nodes(fsi, load_state.policy); if (length) { pr_warn_ratelimited("SELinux: failed to initialize selinuxfs\n"); - selinux_policy_cancel(fsi->state, &load_state); + selinux_policy_cancel(&load_state); goto out; } - selinux_policy_commit(fsi->state, &load_state); + selinux_policy_commit(&load_state); length = count; @@ -665,7 +629,7 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); out: - mutex_unlock(&fsi->state->policy_mutex); + mutex_unlock(&selinux_state.policy_mutex); vfree(data); return length; } @@ -677,23 +641,20 @@ static const struct file_operations sel_load_ops = { static ssize_t sel_write_context(struct file *file, char *buf, size_t size) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *canon = NULL; u32 sid, len; ssize_t length; - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__CHECK_CONTEXT, NULL); if (length) goto out; - length = security_context_to_sid(state, buf, size, &sid, GFP_KERNEL); + length = security_context_to_sid(buf, size, &sid, GFP_KERNEL); if (length) goto out; - length = security_sid_to_context(state, sid, &canon, &len); + length = security_sid_to_context(sid, &canon, &len); if (length) goto out; @@ -714,25 +675,22 @@ out: static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", - checkreqprot_get(fsi->state)); + checkreqprot_get()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; char *page; ssize_t length; unsigned int new_value; - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETCHECKREQPROT, NULL); if (length) @@ -749,24 +707,21 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, if (IS_ERR(page)) return PTR_ERR(page); - length = -EINVAL; - if (sscanf(page, "%u", &new_value) != 1) + if (sscanf(page, "%u", &new_value) != 1) { + length = -EINVAL; goto out; + } + length = count; if (new_value) { char comm[sizeof(current->comm)]; memcpy(comm, current->comm, sizeof(comm)); - pr_err("SELinux: %s (%d) set checkreqprot to 1. This is deprecated and will be rejected in a future kernel release.\n", + pr_err("SELinux: %s (%d) set checkreqprot to 1. This is no longer supported.\n", comm, current->pid); } - checkreqprot_set(fsi->state, (new_value ? 1 : 0)); - if (new_value) - ssleep(15); - length = count; - - selinux_ima_measure_state(fsi->state); + selinux_ima_measure_state(); out: kfree(page); @@ -782,16 +737,13 @@ static ssize_t sel_write_validatetrans(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *oldcon = NULL, *newcon = NULL, *taskcon = NULL; char *req = NULL; u32 osid, nsid, tsid; u16 tclass; int rc; - rc = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + rc = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__VALIDATE_TRANS, NULL); if (rc) goto out; @@ -829,19 +781,19 @@ static ssize_t sel_write_validatetrans(struct file *file, if (sscanf(req, "%s %s %hu %s", oldcon, newcon, &tclass, taskcon) != 4) goto out; - rc = security_context_str_to_sid(state, oldcon, &osid, GFP_KERNEL); + rc = security_context_str_to_sid(oldcon, &osid, GFP_KERNEL); if (rc) goto out; - rc = security_context_str_to_sid(state, newcon, &nsid, GFP_KERNEL); + rc = security_context_str_to_sid(newcon, &nsid, GFP_KERNEL); if (rc) goto out; - rc = security_context_str_to_sid(state, taskcon, &tsid, GFP_KERNEL); + rc = security_context_str_to_sid(taskcon, &tsid, GFP_KERNEL); if (rc) goto out; - rc = security_validate_transition_user(state, osid, nsid, tsid, tclass); + rc = security_validate_transition_user(osid, nsid, tsid, tclass); if (!rc) rc = count; out: @@ -911,16 +863,13 @@ static const struct file_operations transaction_ops = { static ssize_t sel_write_access(struct file *file, char *buf, size_t size) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *scon = NULL, *tcon = NULL; u32 ssid, tsid; u16 tclass; struct av_decision avd; ssize_t length; - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_AV, NULL); if (length) goto out; @@ -939,15 +888,15 @@ static ssize_t sel_write_access(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; - length = security_context_str_to_sid(state, scon, &ssid, GFP_KERNEL); + length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; - length = security_context_str_to_sid(state, tcon, &tsid, GFP_KERNEL); + length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; - security_compute_av_user(state, ssid, tsid, tclass, &avd); + security_compute_av_user(ssid, tsid, tclass, &avd); length = scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%x %x %x %x %u %x", @@ -962,8 +911,6 @@ out: static ssize_t sel_write_create(struct file *file, char *buf, size_t size) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *scon = NULL, *tcon = NULL; char *namebuf = NULL, *objname = NULL; u32 ssid, tsid, newsid; @@ -973,8 +920,7 @@ static ssize_t sel_write_create(struct file *file, char *buf, size_t size) u32 len; int nargs; - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_CREATE, NULL); if (length) @@ -1030,20 +976,20 @@ static ssize_t sel_write_create(struct file *file, char *buf, size_t size) objname = namebuf; } - length = security_context_str_to_sid(state, scon, &ssid, GFP_KERNEL); + length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; - length = security_context_str_to_sid(state, tcon, &tsid, GFP_KERNEL); + length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; - length = security_transition_sid_user(state, ssid, tsid, tclass, + length = security_transition_sid_user(ssid, tsid, tclass, objname, &newsid); if (length) goto out; - length = security_sid_to_context(state, newsid, &newcon, &len); + length = security_sid_to_context(newsid, &newcon, &len); if (length) goto out; @@ -1066,8 +1012,6 @@ out: static ssize_t sel_write_relabel(struct file *file, char *buf, size_t size) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *scon = NULL, *tcon = NULL; u32 ssid, tsid, newsid; u16 tclass; @@ -1075,8 +1019,7 @@ static ssize_t sel_write_relabel(struct file *file, char *buf, size_t size) char *newcon = NULL; u32 len; - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_RELABEL, NULL); if (length) @@ -1096,19 +1039,19 @@ static ssize_t sel_write_relabel(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; - length = security_context_str_to_sid(state, scon, &ssid, GFP_KERNEL); + length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; - length = security_context_str_to_sid(state, tcon, &tsid, GFP_KERNEL); + length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; - length = security_change_sid(state, ssid, tsid, tclass, &newsid); + length = security_change_sid(ssid, tsid, tclass, &newsid); if (length) goto out; - length = security_sid_to_context(state, newsid, &newcon, &len); + length = security_sid_to_context(newsid, &newcon, &len); if (length) goto out; @@ -1127,8 +1070,6 @@ out: static ssize_t sel_write_user(struct file *file, char *buf, size_t size) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *con = NULL, *user = NULL, *ptr; u32 sid, *sids = NULL; ssize_t length; @@ -1136,8 +1077,7 @@ static ssize_t sel_write_user(struct file *file, char *buf, size_t size) int i, rc; u32 len, nsids; - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_USER, NULL); if (length) @@ -1157,18 +1097,18 @@ static ssize_t sel_write_user(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s", con, user) != 2) goto out; - length = security_context_str_to_sid(state, con, &sid, GFP_KERNEL); + length = security_context_str_to_sid(con, &sid, GFP_KERNEL); if (length) goto out; - length = security_get_user_sids(state, sid, user, &sids, &nsids); + length = security_get_user_sids(sid, user, &sids, &nsids); if (length) goto out; length = sprintf(buf, "%u", nsids) + 1; ptr = buf + length; for (i = 0; i < nsids; i++) { - rc = security_sid_to_context(state, sids[i], &newcon, &len); + rc = security_sid_to_context(sids[i], &newcon, &len); if (rc) { length = rc; goto out; @@ -1192,8 +1132,6 @@ out: static ssize_t sel_write_member(struct file *file, char *buf, size_t size) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *scon = NULL, *tcon = NULL; u32 ssid, tsid, newsid; u16 tclass; @@ -1201,8 +1139,7 @@ static ssize_t sel_write_member(struct file *file, char *buf, size_t size) char *newcon = NULL; u32 len; - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__COMPUTE_MEMBER, NULL); if (length) @@ -1222,19 +1159,19 @@ static ssize_t sel_write_member(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; - length = security_context_str_to_sid(state, scon, &ssid, GFP_KERNEL); + length = security_context_str_to_sid(scon, &ssid, GFP_KERNEL); if (length) goto out; - length = security_context_str_to_sid(state, tcon, &tsid, GFP_KERNEL); + length = security_context_str_to_sid(tcon, &tsid, GFP_KERNEL); if (length) goto out; - length = security_member_sid(state, ssid, tsid, tclass, &newsid); + length = security_member_sid(ssid, tsid, tclass, &newsid); if (length) goto out; - length = security_sid_to_context(state, newsid, &newcon, &len); + length = security_sid_to_context(newsid, &newcon, &len); if (length) goto out; @@ -1276,7 +1213,7 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, unsigned index = file_inode(filep)->i_ino & SEL_INO_MASK; const char *name = filep->f_path.dentry->d_name.name; - mutex_lock(&fsi->state->policy_mutex); + mutex_lock(&selinux_state.policy_mutex); ret = -EINVAL; if (index >= fsi->bool_num || strcmp(name, @@ -1288,21 +1225,21 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, if (!page) goto out_unlock; - cur_enforcing = security_get_bool_value(fsi->state, index); + cur_enforcing = security_get_bool_value(index); if (cur_enforcing < 0) { ret = cur_enforcing; goto out_unlock; } length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, fsi->bool_pending_values[index]); - mutex_unlock(&fsi->state->policy_mutex); + mutex_unlock(&selinux_state.policy_mutex); ret = simple_read_from_buffer(buf, count, ppos, page, length); out_free: free_page((unsigned long)page); return ret; out_unlock: - mutex_unlock(&fsi->state->policy_mutex); + mutex_unlock(&selinux_state.policy_mutex); goto out_free; } @@ -1327,10 +1264,9 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, if (IS_ERR(page)) return PTR_ERR(page); - mutex_lock(&fsi->state->policy_mutex); + mutex_lock(&selinux_state.policy_mutex); - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETBOOL, NULL); if (length) @@ -1352,7 +1288,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, length = count; out: - mutex_unlock(&fsi->state->policy_mutex); + mutex_unlock(&selinux_state.policy_mutex); kfree(page); return length; } @@ -1383,10 +1319,9 @@ static ssize_t sel_commit_bools_write(struct file *filep, if (IS_ERR(page)) return PTR_ERR(page); - mutex_lock(&fsi->state->policy_mutex); + mutex_lock(&selinux_state.policy_mutex); - length = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETBOOL, NULL); if (length) @@ -1398,14 +1333,14 @@ static ssize_t sel_commit_bools_write(struct file *filep, length = 0; if (new_value && fsi->bool_pending_values) - length = security_set_bools(fsi->state, fsi->bool_num, + length = security_set_bools(fsi->bool_num, fsi->bool_pending_values); if (!length) length = count; out: - mutex_unlock(&fsi->state->policy_mutex); + mutex_unlock(&selinux_state.policy_mutex); kfree(page); return length; } @@ -1503,13 +1438,11 @@ out: static ssize_t sel_read_avc_cache_threshold(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char tmpbuf[TMPBUFLEN]; ssize_t length; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", - avc_get_cache_threshold(state->avc)); + avc_get_cache_threshold()); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1518,14 +1451,11 @@ static ssize_t sel_write_avc_cache_threshold(struct file *file, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *page; ssize_t ret; unsigned int new_value; - ret = avc_has_perm(&selinux_state, - current_sid(), SECINITSID_SECURITY, + ret = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__SETSECPARAM, NULL); if (ret) @@ -1546,7 +1476,7 @@ static ssize_t sel_write_avc_cache_threshold(struct file *file, if (sscanf(page, "%u", &new_value) != 1) goto out; - avc_set_cache_threshold(state->avc, new_value); + avc_set_cache_threshold(new_value); ret = count; out: @@ -1557,8 +1487,6 @@ out: static ssize_t sel_read_avc_hash_stats(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *page; ssize_t length; @@ -1566,7 +1494,7 @@ static ssize_t sel_read_avc_hash_stats(struct file *filp, char __user *buf, if (!page) return -ENOMEM; - length = avc_get_hash_stats(state->avc, page); + length = avc_get_hash_stats(page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, page, length); free_page((unsigned long)page); @@ -1577,8 +1505,6 @@ static ssize_t sel_read_avc_hash_stats(struct file *filp, char __user *buf, static ssize_t sel_read_sidtab_hash_stats(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(filp)->i_sb->s_fs_info; - struct selinux_state *state = fsi->state; char *page; ssize_t length; @@ -1586,7 +1512,7 @@ static ssize_t sel_read_sidtab_hash_stats(struct file *filp, char __user *buf, if (!page) return -ENOMEM; - length = security_sidtab_hash_stats(state, page); + length = security_sidtab_hash_stats(page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, page, length); @@ -1752,13 +1678,12 @@ static int sel_make_ss_files(struct dentry *dir) static ssize_t sel_read_initcon(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; char *con; u32 sid, len; ssize_t ret; sid = file_inode(file)->i_ino&SEL_INO_MASK; - ret = security_sid_to_context(fsi->state, sid, &con, &len); + ret = security_sid_to_context(sid, &con, &len); if (ret) return ret; @@ -1852,13 +1777,12 @@ static const struct file_operations sel_perm_ops = { static ssize_t sel_read_policycap(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; int value; char tmpbuf[TMPBUFLEN]; ssize_t length; unsigned long i_ino = file_inode(file)->i_ino; - value = security_policycap_supported(fsi->state, i_ino & SEL_INO_MASK); + value = security_policycap_supported(i_ino & SEL_INO_MASK); length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); @@ -2249,13 +2173,3 @@ static int __init init_sel_fs(void) } __initcall(init_sel_fs); - -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -void exit_sel_fs(void) -{ - sysfs_remove_mount_point(fs_kobj, "selinux"); - dput(selinux_null.dentry); - kern_unmount(selinuxfs_mount); - unregister_filesystem(&sel_fs_type); -} -#endif diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 0092b29022f5..f14d1ffe54c5 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -235,16 +235,16 @@ static void map_decision(struct selinux_map *map, } } -int security_mls_enabled(struct selinux_state *state) +int security_mls_enabled(void) { int mls_enabled; struct selinux_policy *policy; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); mls_enabled = policy->policydb.mls_enabled; rcu_read_unlock(); return mls_enabled; @@ -713,8 +713,7 @@ static void context_struct_compute_av(struct policydb *policydb, tclass, avd); } -static int security_validtrans_handle_fail(struct selinux_state *state, - struct selinux_policy *policy, +static int security_validtrans_handle_fail(struct selinux_policy *policy, struct sidtab_entry *oentry, struct sidtab_entry *nentry, struct sidtab_entry *tentry, @@ -740,13 +739,12 @@ out: kfree(n); kfree(t); - if (!enforcing_enabled(state)) + if (!enforcing_enabled()) return 0; return -EPERM; } -static int security_compute_validatetrans(struct selinux_state *state, - u32 oldsid, u32 newsid, u32 tasksid, +static int security_compute_validatetrans(u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass, bool user) { struct selinux_policy *policy; @@ -761,12 +759,12 @@ static int security_compute_validatetrans(struct selinux_state *state, int rc = 0; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -813,8 +811,7 @@ static int security_compute_validatetrans(struct selinux_state *state, if (user) rc = -EPERM; else - rc = security_validtrans_handle_fail(state, - policy, + rc = security_validtrans_handle_fail(policy, oentry, nentry, tentry, @@ -829,19 +826,17 @@ out: return rc; } -int security_validate_transition_user(struct selinux_state *state, - u32 oldsid, u32 newsid, u32 tasksid, +int security_validate_transition_user(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass) { - return security_compute_validatetrans(state, oldsid, newsid, tasksid, + return security_compute_validatetrans(oldsid, newsid, tasksid, tclass, true); } -int security_validate_transition(struct selinux_state *state, - u32 oldsid, u32 newsid, u32 tasksid, +int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass) { - return security_compute_validatetrans(state, oldsid, newsid, tasksid, + return security_compute_validatetrans(oldsid, newsid, tasksid, orig_tclass, false); } @@ -851,12 +846,10 @@ int security_validate_transition(struct selinux_state *state, * It returns 0, if @newsid is bounded by @oldsid. * Otherwise, it returns error code. * - * @state: SELinux state * @oldsid : current security identifier * @newsid : destinated security identifier */ -int security_bounded_transition(struct selinux_state *state, - u32 old_sid, u32 new_sid) +int security_bounded_transition(u32 old_sid, u32 new_sid) { struct selinux_policy *policy; struct policydb *policydb; @@ -866,11 +859,11 @@ int security_bounded_transition(struct selinux_state *state, int index; int rc; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -1004,8 +997,7 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, } } -void security_compute_xperms_decision(struct selinux_state *state, - u32 ssid, +void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 orig_tclass, u8 driver, @@ -1029,10 +1021,10 @@ void security_compute_xperms_decision(struct selinux_state *state, memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p)); rcu_read_lock(); - if (!selinux_initialized(state)) + if (!selinux_initialized()) goto allow; - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -1091,7 +1083,6 @@ allow: /** * security_compute_av - Compute access vector decisions. - * @state: SELinux state * @ssid: source security identifier * @tsid: target security identifier * @orig_tclass: target security class @@ -1101,8 +1092,7 @@ allow: * Compute a set of access vector decisions based on the * SID pair (@ssid, @tsid) for the permissions in @tclass. */ -void security_compute_av(struct selinux_state *state, - u32 ssid, +void security_compute_av(u32 ssid, u32 tsid, u16 orig_tclass, struct av_decision *avd, @@ -1115,10 +1105,10 @@ void security_compute_av(struct selinux_state *state, struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); avd_init(policy, avd); xperms->len = 0; - if (!selinux_initialized(state)) + if (!selinux_initialized()) goto allow; policydb = &policy->policydb; @@ -1160,8 +1150,7 @@ allow: goto out; } -void security_compute_av_user(struct selinux_state *state, - u32 ssid, +void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) @@ -1172,9 +1161,9 @@ void security_compute_av_user(struct selinux_state *state, struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); avd_init(policy, avd); - if (!selinux_initialized(state)) + if (!selinux_initialized()) goto allow; policydb = &policy->policydb; @@ -1290,19 +1279,19 @@ static int sidtab_entry_to_string(struct policydb *p, #include "initial_sid_to_string.h" -int security_sidtab_hash_stats(struct selinux_state *state, char *page) +int security_sidtab_hash_stats(char *page) { struct selinux_policy *policy; int rc; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { pr_err("SELinux: %s: called before initial load_policy\n", __func__); return -EINVAL; } rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); rc = sidtab_hash_stats(policy->sidtab, page); rcu_read_unlock(); @@ -1316,8 +1305,7 @@ const char *security_get_initial_sid_context(u32 sid) return initial_sid_to_string[sid]; } -static int security_sid_to_context_core(struct selinux_state *state, - u32 sid, char **scontext, +static int security_sid_to_context_core(u32 sid, char **scontext, u32 *scontext_len, int force, int only_invalid) { @@ -1331,7 +1319,7 @@ static int security_sid_to_context_core(struct selinux_state *state, *scontext = NULL; *scontext_len = 0; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { if (sid <= SECINITSID_NUM) { char *scontextp; const char *s = initial_sid_to_string[sid]; @@ -1352,7 +1340,7 @@ static int security_sid_to_context_core(struct selinux_state *state, return -EINVAL; } rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -1380,7 +1368,6 @@ out_unlock: /** * security_sid_to_context - Obtain a context for a given SID. - * @state: SELinux state * @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes @@ -1389,24 +1376,22 @@ out_unlock: * into a dynamically allocated string of the correct size. Set @scontext * to point to this string and set @scontext_len to the length of the string. */ -int security_sid_to_context(struct selinux_state *state, - u32 sid, char **scontext, u32 *scontext_len) +int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len) { - return security_sid_to_context_core(state, sid, scontext, + return security_sid_to_context_core(sid, scontext, scontext_len, 0, 0); } -int security_sid_to_context_force(struct selinux_state *state, u32 sid, +int security_sid_to_context_force(u32 sid, char **scontext, u32 *scontext_len) { - return security_sid_to_context_core(state, sid, scontext, + return security_sid_to_context_core(sid, scontext, scontext_len, 1, 0); } /** * security_sid_to_context_inval - Obtain a context for a given SID if it * is invalid. - * @state: SELinux state * @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes @@ -1417,10 +1402,10 @@ int security_sid_to_context_force(struct selinux_state *state, u32 sid, * this string (or NULL if the context is valid) and set @scontext_len to * the length of the string (or 0 if the context is valid). */ -int security_sid_to_context_inval(struct selinux_state *state, u32 sid, +int security_sid_to_context_inval(u32 sid, char **scontext, u32 *scontext_len) { - return security_sid_to_context_core(state, sid, scontext, + return security_sid_to_context_core(sid, scontext, scontext_len, 1, 1); } @@ -1505,8 +1490,7 @@ out: return rc; } -static int security_context_to_sid_core(struct selinux_state *state, - const char *scontext, u32 scontext_len, +static int security_context_to_sid_core(const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags, int force) { @@ -1526,7 +1510,7 @@ static int security_context_to_sid_core(struct selinux_state *state, if (!scontext2) return -ENOMEM; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { int i; for (i = 1; i < SECINITSID_NUM; i++) { @@ -1551,7 +1535,7 @@ static int security_context_to_sid_core(struct selinux_state *state, } retry: rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; rc = string_to_context_struct(policydb, sidtab, scontext2, @@ -1583,7 +1567,6 @@ out: /** * security_context_to_sid - Obtain a SID for a given security context. - * @state: SELinux state * @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID @@ -1594,18 +1577,16 @@ out: * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ -int security_context_to_sid(struct selinux_state *state, - const char *scontext, u32 scontext_len, u32 *sid, +int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *sid, gfp_t gfp) { - return security_context_to_sid_core(state, scontext, scontext_len, + return security_context_to_sid_core(scontext, scontext_len, sid, SECSID_NULL, gfp, 0); } -int security_context_str_to_sid(struct selinux_state *state, - const char *scontext, u32 *sid, gfp_t gfp) +int security_context_str_to_sid(const char *scontext, u32 *sid, gfp_t gfp) { - return security_context_to_sid(state, scontext, strlen(scontext), + return security_context_to_sid(scontext, strlen(scontext), sid, gfp); } @@ -1613,7 +1594,6 @@ int security_context_str_to_sid(struct selinux_state *state, * security_context_to_sid_default - Obtain a SID for a given security context, * falling back to specified default if needed. * - * @state: SELinux state * @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID @@ -1629,24 +1609,21 @@ int security_context_str_to_sid(struct selinux_state *state, * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ -int security_context_to_sid_default(struct selinux_state *state, - const char *scontext, u32 scontext_len, +int security_context_to_sid_default(const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags) { - return security_context_to_sid_core(state, scontext, scontext_len, + return security_context_to_sid_core(scontext, scontext_len, sid, def_sid, gfp_flags, 1); } -int security_context_to_sid_force(struct selinux_state *state, - const char *scontext, u32 scontext_len, +int security_context_to_sid_force(const char *scontext, u32 scontext_len, u32 *sid) { - return security_context_to_sid_core(state, scontext, scontext_len, + return security_context_to_sid_core(scontext, scontext_len, sid, SECSID_NULL, GFP_KERNEL, 1); } static int compute_sid_handle_invalid_context( - struct selinux_state *state, struct selinux_policy *policy, struct sidtab_entry *sentry, struct sidtab_entry *tentry, @@ -1679,7 +1656,7 @@ out: kfree(s); kfree(t); kfree(n); - if (!enforcing_enabled(state)) + if (!enforcing_enabled()) return 0; return -EACCES; } @@ -1714,8 +1691,7 @@ static void filename_compute_type(struct policydb *policydb, } } -static int security_compute_sid(struct selinux_state *state, - u32 ssid, +static int security_compute_sid(u32 ssid, u32 tsid, u16 orig_tclass, u32 specified, @@ -1736,7 +1712,7 @@ static int security_compute_sid(struct selinux_state *state, int rc = 0; bool sock; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { switch (orig_tclass) { case SECCLASS_PROCESS: /* kernel value */ *out_sid = ssid; @@ -1754,7 +1730,7 @@ retry: rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); if (kern) { tclass = unmap_class(&policy->map, orig_tclass); @@ -1886,7 +1862,7 @@ retry: /* Check the validity of the context. */ if (!policydb_context_isvalid(policydb, &newcontext)) { - rc = compute_sid_handle_invalid_context(state, policy, sentry, + rc = compute_sid_handle_invalid_context(policy, sentry, tentry, tclass, &newcontext); if (rc) @@ -1908,7 +1884,6 @@ out: /** * security_transition_sid - Compute the SID for a new subject/object. - * @state: SELinux state * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -1921,27 +1896,24 @@ out: * if insufficient memory is available, or %0 if the new SID was * computed successfully. */ -int security_transition_sid(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +int security_transition_sid(u32 ssid, u32 tsid, u16 tclass, const struct qstr *qstr, u32 *out_sid) { - return security_compute_sid(state, ssid, tsid, tclass, + return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION, qstr ? qstr->name : NULL, out_sid, true); } -int security_transition_sid_user(struct selinux_state *state, - u32 ssid, u32 tsid, u16 tclass, +int security_transition_sid_user(u32 ssid, u32 tsid, u16 tclass, const char *objname, u32 *out_sid) { - return security_compute_sid(state, ssid, tsid, tclass, + return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION, objname, out_sid, false); } /** * security_member_sid - Compute the SID for member selection. - * @state: SELinux state * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -1953,20 +1925,18 @@ int security_transition_sid_user(struct selinux_state *state, * if insufficient memory is available, or %0 if the SID was * computed successfully. */ -int security_member_sid(struct selinux_state *state, - u32 ssid, +int security_member_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { - return security_compute_sid(state, ssid, tsid, tclass, + return security_compute_sid(ssid, tsid, tclass, AVTAB_MEMBER, NULL, out_sid, false); } /** * security_change_sid - Compute the SID for object relabeling. - * @state: SELinux state * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class @@ -1978,26 +1948,23 @@ int security_member_sid(struct selinux_state *state, * if insufficient memory is available, or %0 if the SID was * computed successfully. */ -int security_change_sid(struct selinux_state *state, - u32 ssid, +int security_change_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { - return security_compute_sid(state, - ssid, tsid, tclass, AVTAB_CHANGE, NULL, + return security_compute_sid(ssid, tsid, tclass, AVTAB_CHANGE, NULL, out_sid, false); } static inline int convert_context_handle_invalid_context( - struct selinux_state *state, struct policydb *policydb, struct context *context) { char *s; u32 len; - if (enforcing_enabled(state)) + if (enforcing_enabled()) return -EINVAL; if (!context_struct_to_string(policydb, context, &s, &len)) { @@ -2115,8 +2082,7 @@ int services_convert_context(struct convert_context_args *args, /* Check the validity of the new context. */ if (!policydb_context_isvalid(args->newp, newc)) { - rc = convert_context_handle_invalid_context(args->state, - args->oldp, oldc); + rc = convert_context_handle_invalid_context(args->oldp, oldc); if (rc) goto bad; } @@ -2135,8 +2101,7 @@ bad: return 0; } -static void security_load_policycaps(struct selinux_state *state, - struct selinux_policy *policy) +static void security_load_policycaps(struct selinux_policy *policy) { struct policydb *p; unsigned int i; @@ -2144,8 +2109,8 @@ static void security_load_policycaps(struct selinux_state *state, p = &policy->policydb; - for (i = 0; i < ARRAY_SIZE(state->policycap); i++) - WRITE_ONCE(state->policycap[i], + for (i = 0; i < ARRAY_SIZE(selinux_state.policycap); i++) + WRITE_ONCE(selinux_state.policycap[i], ebitmap_get_bit(&p->policycaps, i)); for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) @@ -2181,9 +2146,9 @@ static void selinux_policy_cond_free(struct selinux_policy *policy) kfree(policy); } -void selinux_policy_cancel(struct selinux_state *state, - struct selinux_load_state *load_state) +void selinux_policy_cancel(struct selinux_load_state *load_state) { + struct selinux_state *state = &selinux_state; struct selinux_policy *oldpolicy; oldpolicy = rcu_dereference_protected(state->policy, @@ -2194,21 +2159,20 @@ void selinux_policy_cancel(struct selinux_state *state, kfree(load_state->convert_data); } -static void selinux_notify_policy_change(struct selinux_state *state, - u32 seqno) +static void selinux_notify_policy_change(u32 seqno) { /* Flush external caches and notify userspace of policy load */ - avc_ss_reset(state->avc, seqno); + avc_ss_reset(seqno); selnl_notify_policyload(seqno); - selinux_status_update_policyload(state, seqno); + selinux_status_update_policyload(seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); - selinux_ima_measure_state_locked(state); + selinux_ima_measure_state_locked(); } -void selinux_policy_commit(struct selinux_state *state, - struct selinux_load_state *load_state) +void selinux_policy_commit(struct selinux_load_state *load_state) { + struct selinux_state *state = &selinux_state; struct selinux_policy *oldpolicy, *newpolicy = load_state->policy; unsigned long flags; u32 seqno; @@ -2241,15 +2205,15 @@ void selinux_policy_commit(struct selinux_state *state, } /* Load the policycaps from the new policy */ - security_load_policycaps(state, newpolicy); + security_load_policycaps(newpolicy); - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { /* * After first policy load, the security server is * marked as initialized and ready to handle requests and * any objects created prior to policy load are then labeled. */ - selinux_mark_initialized(state); + selinux_mark_initialized(); selinux_complete_init(); } @@ -2259,12 +2223,11 @@ void selinux_policy_commit(struct selinux_state *state, kfree(load_state->convert_data); /* Notify others of the policy change */ - selinux_notify_policy_change(state, seqno); + selinux_notify_policy_change(seqno); } /** * security_load_policy - Load a security policy configuration. - * @state: SELinux state * @data: binary policy data * @len: length of data in bytes * @load_state: policy load state @@ -2274,9 +2237,10 @@ void selinux_policy_commit(struct selinux_state *state, * This function will flush the access vector cache after * loading the new policy. */ -int security_load_policy(struct selinux_state *state, void *data, size_t len, +int security_load_policy(void *data, size_t len, struct selinux_load_state *load_state) { + struct selinux_state *state = &selinux_state; struct selinux_policy *newpolicy, *oldpolicy; struct selinux_policy_convert_data *convert_data; int rc = 0; @@ -2308,7 +2272,7 @@ int security_load_policy(struct selinux_state *state, void *data, size_t len, goto err_mapping; } - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { /* First policy load, so no need to preserve state from old policy */ load_state->policy = newpolicy; load_state->convert_data = NULL; @@ -2336,7 +2300,6 @@ int security_load_policy(struct selinux_state *state, void *data, size_t len, goto err_free_isids; } - convert_data->args.state = state; convert_data->args.oldp = &oldpolicy->policydb; convert_data->args.newp = &newpolicy->policydb; @@ -2410,13 +2373,11 @@ static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c, /** * security_port_sid - Obtain the SID for a port. - * @state: SELinux state * @protocol: protocol number * @port: port number * @out_sid: security identifier */ -int security_port_sid(struct selinux_state *state, - u8 protocol, u16 port, u32 *out_sid) +int security_port_sid(u8 protocol, u16 port, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; @@ -2424,7 +2385,7 @@ int security_port_sid(struct selinux_state *state, struct ocontext *c; int rc; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *out_sid = SECINITSID_PORT; return 0; } @@ -2432,7 +2393,7 @@ int security_port_sid(struct selinux_state *state, retry: rc = 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -2464,13 +2425,11 @@ out: /** * security_ib_pkey_sid - Obtain the SID for a pkey. - * @state: SELinux state * @subnet_prefix: Subnet Prefix * @pkey_num: pkey number * @out_sid: security identifier */ -int security_ib_pkey_sid(struct selinux_state *state, - u64 subnet_prefix, u16 pkey_num, u32 *out_sid) +int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; @@ -2478,7 +2437,7 @@ int security_ib_pkey_sid(struct selinux_state *state, struct ocontext *c; int rc; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *out_sid = SECINITSID_UNLABELED; return 0; } @@ -2486,7 +2445,7 @@ int security_ib_pkey_sid(struct selinux_state *state, retry: rc = 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -2518,13 +2477,11 @@ out: /** * security_ib_endport_sid - Obtain the SID for a subnet management interface. - * @state: SELinux state * @dev_name: device name * @port_num: port number * @out_sid: security identifier */ -int security_ib_endport_sid(struct selinux_state *state, - const char *dev_name, u8 port_num, u32 *out_sid) +int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; @@ -2532,7 +2489,7 @@ int security_ib_endport_sid(struct selinux_state *state, struct ocontext *c; int rc; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *out_sid = SECINITSID_UNLABELED; return 0; } @@ -2540,7 +2497,7 @@ int security_ib_endport_sid(struct selinux_state *state, retry: rc = 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -2573,12 +2530,10 @@ out: /** * security_netif_sid - Obtain the SID for a network interface. - * @state: SELinux state * @name: interface name * @if_sid: interface SID */ -int security_netif_sid(struct selinux_state *state, - char *name, u32 *if_sid) +int security_netif_sid(char *name, u32 *if_sid) { struct selinux_policy *policy; struct policydb *policydb; @@ -2586,7 +2541,7 @@ int security_netif_sid(struct selinux_state *state, int rc; struct ocontext *c; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *if_sid = SECINITSID_NETIF; return 0; } @@ -2594,7 +2549,7 @@ int security_netif_sid(struct selinux_state *state, retry: rc = 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -2636,14 +2591,12 @@ static int match_ipv6_addrmask(u32 *input, u32 *addr, u32 *mask) /** * security_node_sid - Obtain the SID for a node (host). - * @state: SELinux state * @domain: communication domain aka address family * @addrp: address * @addrlen: address length in bytes * @out_sid: security identifier */ -int security_node_sid(struct selinux_state *state, - u16 domain, +int security_node_sid(u16 domain, void *addrp, u32 addrlen, u32 *out_sid) @@ -2654,14 +2607,14 @@ int security_node_sid(struct selinux_state *state, int rc; struct ocontext *c; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *out_sid = SECINITSID_NODE; return 0; } retry: rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -2725,7 +2678,6 @@ out: /** * security_get_user_sids - Obtain reachable SIDs for a user. - * @state: SELinux state * @fromsid: starting SID * @username: username * @sids: array of reachable SIDs for user @@ -2738,8 +2690,7 @@ out: * number of elements in the array. */ -int security_get_user_sids(struct selinux_state *state, - u32 fromsid, +int security_get_user_sids(u32 fromsid, char *username, u32 **sids, u32 *nel) @@ -2758,7 +2709,7 @@ int security_get_user_sids(struct selinux_state *state, *sids = NULL; *nel = 0; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL); @@ -2768,7 +2719,7 @@ int security_get_user_sids(struct selinux_state *state, retry: mynel = 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -2834,8 +2785,7 @@ out_unlock: } for (i = 0, j = 0; i < mynel; i++) { struct av_decision dummy_avd; - rc = avc_has_perm_noaudit(state, - fromsid, mysids[i], + rc = avc_has_perm_noaudit(fromsid, mysids[i], SECCLASS_PROCESS, /* kernel value */ PROCESS__TRANSITION, AVC_STRICT, &dummy_avd); @@ -2908,7 +2858,6 @@ static inline int __security_genfs_sid(struct selinux_policy *policy, /** * security_genfs_sid - Obtain a SID for a file in a filesystem - * @state: SELinux state * @fstype: filesystem type * @path: path from root of mount * @orig_sclass: file security class @@ -2917,8 +2866,7 @@ static inline int __security_genfs_sid(struct selinux_policy *policy, * Acquire policy_rwlock before calling __security_genfs_sid() and release * it afterward. */ -int security_genfs_sid(struct selinux_state *state, - const char *fstype, +int security_genfs_sid(const char *fstype, const char *path, u16 orig_sclass, u32 *sid) @@ -2926,14 +2874,14 @@ int security_genfs_sid(struct selinux_state *state, struct selinux_policy *policy; int retval; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *sid = SECINITSID_UNLABELED; return 0; } do { rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); retval = __security_genfs_sid(policy, fstype, path, orig_sclass, sid); rcu_read_unlock(); @@ -2953,10 +2901,9 @@ int selinux_policy_genfs_sid(struct selinux_policy *policy, /** * security_fs_use - Determine how to handle labeling for a filesystem. - * @state: SELinux state * @sb: superblock in question */ -int security_fs_use(struct selinux_state *state, struct super_block *sb) +int security_fs_use(struct super_block *sb) { struct selinux_policy *policy; struct policydb *policydb; @@ -2966,7 +2913,7 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb) struct superblock_security_struct *sbsec = selinux_superblock(sb); const char *fstype = sb->s_type->name; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { sbsec->behavior = SECURITY_FS_USE_NONE; sbsec->sid = SECINITSID_UNLABELED; return 0; @@ -2974,7 +2921,7 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb) retry: rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -3067,13 +3014,14 @@ err: } -int security_set_bools(struct selinux_state *state, u32 len, int *values) +int security_set_bools(u32 len, int *values) { + struct selinux_state *state = &selinux_state; struct selinux_policy *newpolicy, *oldpolicy; int rc; u32 i, seqno = 0; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return -EINVAL; oldpolicy = rcu_dereference_protected(state->policy, @@ -3134,23 +3082,22 @@ int security_set_bools(struct selinux_state *state, u32 len, int *values) selinux_policy_cond_free(oldpolicy); /* Notify others of the policy change */ - selinux_notify_policy_change(state, seqno); + selinux_notify_policy_change(seqno); return 0; } -int security_get_bool_value(struct selinux_state *state, - u32 index) +int security_get_bool_value(u32 index) { struct selinux_policy *policy; struct policydb *policydb; int rc; u32 len; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; rc = -EFAULT; @@ -3197,8 +3144,7 @@ out: * security_sid_mls_copy() - computes a new sid based on the given * sid and the mls portion of mls_sid. */ -int security_sid_mls_copy(struct selinux_state *state, - u32 sid, u32 mls_sid, u32 *new_sid) +int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid) { struct selinux_policy *policy; struct policydb *policydb; @@ -3210,7 +3156,7 @@ int security_sid_mls_copy(struct selinux_state *state, u32 len; int rc; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *new_sid = sid; return 0; } @@ -3220,7 +3166,7 @@ retry: context_init(&newcon); rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -3254,7 +3200,7 @@ retry: /* Check the validity of the new context. */ if (!policydb_context_isvalid(policydb, &newcon)) { - rc = convert_context_handle_invalid_context(state, policydb, + rc = convert_context_handle_invalid_context(policydb, &newcon); if (rc) { if (!context_struct_to_string(policydb, &newcon, &s, @@ -3288,7 +3234,6 @@ out_unlock: /** * security_net_peersid_resolve - Compare and resolve two network peer SIDs - * @state: SELinux state * @nlbl_sid: NetLabel SID * @nlbl_type: NetLabel labeling protocol type * @xfrm_sid: XFRM SID @@ -3308,8 +3253,7 @@ out_unlock: * multiple, inconsistent labels | -<errno> | SECSID_NULL * */ -int security_net_peersid_resolve(struct selinux_state *state, - u32 nlbl_sid, u32 nlbl_type, +int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, u32 xfrm_sid, u32 *peer_sid) { @@ -3337,11 +3281,11 @@ int security_net_peersid_resolve(struct selinux_state *state, return 0; } - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -3482,31 +3426,31 @@ err: return rc; } -int security_get_reject_unknown(struct selinux_state *state) +int security_get_reject_unknown(void) { struct selinux_policy *policy; int value; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); value = policy->policydb.reject_unknown; rcu_read_unlock(); return value; } -int security_get_allow_unknown(struct selinux_state *state) +int security_get_allow_unknown(void) { struct selinux_policy *policy; int value; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); value = policy->policydb.allow_unknown; rcu_read_unlock(); return value; @@ -3514,7 +3458,6 @@ int security_get_allow_unknown(struct selinux_state *state) /** * security_policycap_supported - Check for a specific policy capability - * @state: SELinux state * @req_cap: capability * * Description: @@ -3523,17 +3466,16 @@ int security_get_allow_unknown(struct selinux_state *state) * supported, false (0) if it isn't supported. * */ -int security_policycap_supported(struct selinux_state *state, - unsigned int req_cap) +int security_policycap_supported(unsigned int req_cap) { struct selinux_policy *policy; int rc; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); rc = ebitmap_get_bit(&policy->policydb.policycaps, req_cap); rcu_read_unlock(); @@ -3569,7 +3511,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) *rule = NULL; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return -EOPNOTSUPP; switch (field) { @@ -3696,7 +3638,7 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule) return -ENOENT; } - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); @@ -3849,7 +3791,6 @@ static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, /** * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID - * @state: SELinux state * @secattr: the NetLabel packet security attributes * @sid: the SELinux SID * @@ -3863,8 +3804,7 @@ static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, * failure. * */ -int security_netlbl_secattr_to_sid(struct selinux_state *state, - struct netlbl_lsm_secattr *secattr, +int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, u32 *sid) { struct selinux_policy *policy; @@ -3874,7 +3814,7 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state, struct context *ctx; struct context ctx_new; - if (!selinux_initialized(state)) { + if (!selinux_initialized()) { *sid = SECSID_NULL; return 0; } @@ -3882,7 +3822,7 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state, retry: rc = 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; @@ -3932,7 +3872,6 @@ out: /** * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr - * @state: SELinux state * @sid: the SELinux SID * @secattr: the NetLabel packet security attributes * @@ -3941,19 +3880,18 @@ out: * Returns zero on success, negative values on failure. * */ -int security_netlbl_sid_to_secattr(struct selinux_state *state, - u32 sid, struct netlbl_lsm_secattr *secattr) +int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr) { struct selinux_policy *policy; struct policydb *policydb; int rc; struct context *ctx; - if (!selinux_initialized(state)) + if (!selinux_initialized()) return 0; rcu_read_lock(); - policy = rcu_dereference(state->policy); + policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; rc = -ENOENT; @@ -4003,14 +3941,13 @@ static int __security_read_policy(struct selinux_policy *policy, /** * security_read_policy - read the policy. - * @state: selinux_state * @data: binary policy data * @len: length of data in bytes * */ -int security_read_policy(struct selinux_state *state, - void **data, size_t *len) +int security_read_policy(void **data, size_t *len) { + struct selinux_state *state = &selinux_state; struct selinux_policy *policy; policy = rcu_dereference_protected( @@ -4028,7 +3965,6 @@ int security_read_policy(struct selinux_state *state, /** * security_read_state_kernel - read the policy. - * @state: selinux_state * @data: binary policy data * @len: length of data in bytes * @@ -4038,10 +3974,10 @@ int security_read_policy(struct selinux_state *state, * * This function must be called with policy_mutex held. */ -int security_read_state_kernel(struct selinux_state *state, - void **data, size_t *len) +int security_read_state_kernel(void **data, size_t *len) { int err; + struct selinux_state *state = &selinux_state; struct selinux_policy *policy; policy = rcu_dereference_protected( diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h index c4301626487f..8a9b85f44b66 100644 --- a/security/selinux/ss/services.h +++ b/security/selinux/ss/services.h @@ -30,7 +30,6 @@ struct selinux_policy { } __randomize_layout; struct convert_context_args { - struct selinux_state *state; struct policydb *oldp; struct policydb *newp; }; diff --git a/security/selinux/status.c b/security/selinux/status.c index 4bc8f809934c..19ef929a075c 100644 --- a/security/selinux/status.c +++ b/security/selinux/status.c @@ -39,21 +39,21 @@ * It returns a reference to selinux_status_page. If the status page is * not allocated yet, it also tries to allocate it at the first time. */ -struct page *selinux_kernel_status_page(struct selinux_state *state) +struct page *selinux_kernel_status_page(void) { struct selinux_kernel_status *status; struct page *result = NULL; - mutex_lock(&state->status_lock); - if (!state->status_page) { - state->status_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + mutex_lock(&selinux_state.status_lock); + if (!selinux_state.status_page) { + selinux_state.status_page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (state->status_page) { - status = page_address(state->status_page); + if (selinux_state.status_page) { + status = page_address(selinux_state.status_page); status->version = SELINUX_KERNEL_STATUS_VERSION; status->sequence = 0; - status->enforcing = enforcing_enabled(state); + status->enforcing = enforcing_enabled(); /* * NOTE: the next policyload event shall set * a positive value on the status->policyload, @@ -62,11 +62,11 @@ struct page *selinux_kernel_status_page(struct selinux_state *state) */ status->policyload = 0; status->deny_unknown = - !security_get_allow_unknown(state); + !security_get_allow_unknown(); } } - result = state->status_page; - mutex_unlock(&state->status_lock); + result = selinux_state.status_page; + mutex_unlock(&selinux_state.status_lock); return result; } @@ -76,14 +76,13 @@ struct page *selinux_kernel_status_page(struct selinux_state *state) * * It updates status of the current enforcing/permissive mode. */ -void selinux_status_update_setenforce(struct selinux_state *state, - int enforcing) +void selinux_status_update_setenforce(int enforcing) { struct selinux_kernel_status *status; - mutex_lock(&state->status_lock); - if (state->status_page) { - status = page_address(state->status_page); + mutex_lock(&selinux_state.status_lock); + if (selinux_state.status_page) { + status = page_address(selinux_state.status_page); status->sequence++; smp_wmb(); @@ -93,7 +92,7 @@ void selinux_status_update_setenforce(struct selinux_state *state, smp_wmb(); status->sequence++; } - mutex_unlock(&state->status_lock); + mutex_unlock(&selinux_state.status_lock); } /* @@ -102,23 +101,22 @@ void selinux_status_update_setenforce(struct selinux_state *state, * It updates status of the times of policy reloaded, and current * setting of deny_unknown. */ -void selinux_status_update_policyload(struct selinux_state *state, - int seqno) +void selinux_status_update_policyload(int seqno) { struct selinux_kernel_status *status; - mutex_lock(&state->status_lock); - if (state->status_page) { - status = page_address(state->status_page); + mutex_lock(&selinux_state.status_lock); + if (selinux_state.status_page) { + status = page_address(selinux_state.status_page); status->sequence++; smp_wmb(); status->policyload = seqno; - status->deny_unknown = !security_get_allow_unknown(state); + status->deny_unknown = !security_get_allow_unknown(); smp_wmb(); status->sequence++; } - mutex_unlock(&state->status_lock); + mutex_unlock(&selinux_state.status_lock); } diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index c576832febc6..1fca42c4d0ae 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -98,13 +98,12 @@ static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp, ctx->ctx_len = str_len; memcpy(ctx->ctx_str, &uctx[1], str_len); ctx->ctx_str[str_len] = '\0'; - rc = security_context_to_sid(&selinux_state, ctx->ctx_str, str_len, + rc = security_context_to_sid(ctx->ctx_str, str_len, &ctx->ctx_sid, gfp); if (rc) goto err; - rc = avc_has_perm(&selinux_state, - tsec->sid, ctx->ctx_sid, + rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); if (rc) goto err; @@ -140,8 +139,7 @@ static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx) if (!ctx) return 0; - return avc_has_perm(&selinux_state, - tsec->sid, ctx->ctx_sid, + return avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); } @@ -163,8 +161,7 @@ int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) if (!selinux_authorizable_ctx(ctx)) return -EINVAL; - rc = avc_has_perm(&selinux_state, - fl_secid, ctx->ctx_sid, + rc = avc_has_perm(fl_secid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__POLMATCH, NULL); return (rc == -EACCES ? -ESRCH : rc); } @@ -205,7 +202,7 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, /* We don't need a separate SA Vs. policy polmatch check since the SA * is now of the same label as the flow and a flow Vs. policy polmatch * check had already happened in selinux_xfrm_policy_lookup() above. */ - return (avc_has_perm(&selinux_state, flic_sid, state_sid, + return (avc_has_perm(flic_sid, state_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, NULL) ? 0 : 1); } @@ -355,7 +352,7 @@ int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x, if (secid == 0) return -EINVAL; - rc = security_sid_to_context(&selinux_state, secid, &ctx_str, + rc = security_sid_to_context(secid, &ctx_str, &str_len); if (rc) return rc; @@ -424,8 +421,7 @@ int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb, /* This check even when there's no association involved is intended, * according to Trent Jaeger, to make sure a process can't engage in * non-IPsec communication unless explicitly allowed by policy. */ - return avc_has_perm(&selinux_state, - sk_sid, peer_sid, + return avc_has_perm(sk_sid, peer_sid, SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, ad); } @@ -468,6 +464,6 @@ int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb, /* This check even when there's no association involved is intended, * according to Trent Jaeger, to make sure a process can't engage in * non-IPsec communication unless explicitly allowed by policy. */ - return avc_has_perm(&selinux_state, sk_sid, SECINITSID_UNLABELED, + return avc_has_perm(sk_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, ad); } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index cfcbb748da25..7a3e9ab137d8 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -550,23 +550,22 @@ static int smack_sb_alloc_security(struct super_block *sb) } struct smack_mnt_opts { - const char *fsdefault, *fsfloor, *fshat, *fsroot, *fstransmute; + const char *fsdefault; + const char *fsfloor; + const char *fshat; + const char *fsroot; + const char *fstransmute; }; static void smack_free_mnt_opts(void *mnt_opts) { - struct smack_mnt_opts *opts = mnt_opts; - kfree(opts->fsdefault); - kfree(opts->fsfloor); - kfree(opts->fshat); - kfree(opts->fsroot); - kfree(opts->fstransmute); - kfree(opts); + kfree(mnt_opts); } static int smack_add_opt(int token, const char *s, void **mnt_opts) { struct smack_mnt_opts *opts = *mnt_opts; + struct smack_known *skp; if (!opts) { opts = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL); @@ -577,31 +576,35 @@ static int smack_add_opt(int token, const char *s, void **mnt_opts) if (!s) return -ENOMEM; + skp = smk_import_entry(s, 0); + if (IS_ERR(skp)) + return PTR_ERR(skp); + switch (token) { case Opt_fsdefault: if (opts->fsdefault) goto out_opt_err; - opts->fsdefault = s; + opts->fsdefault = skp->smk_known; break; case Opt_fsfloor: if (opts->fsfloor) goto out_opt_err; - opts->fsfloor = s; + opts->fsfloor = skp->smk_known; break; case Opt_fshat: if (opts->fshat) goto out_opt_err; - opts->fshat = s; + opts->fshat = skp->smk_known; break; case Opt_fsroot: if (opts->fsroot) goto out_opt_err; - opts->fsroot = s; + opts->fsroot = skp->smk_known; break; case Opt_fstransmute: if (opts->fstransmute) goto out_opt_err; - opts->fstransmute = s; + opts->fstransmute = skp->smk_known; break; } return 0; @@ -629,33 +632,14 @@ static int smack_fs_context_dup(struct fs_context *fc, fc->security = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL); if (!fc->security) return -ENOMEM; + dst = fc->security; + dst->fsdefault = src->fsdefault; + dst->fsfloor = src->fsfloor; + dst->fshat = src->fshat; + dst->fsroot = src->fsroot; + dst->fstransmute = src->fstransmute; - if (src->fsdefault) { - dst->fsdefault = kstrdup(src->fsdefault, GFP_KERNEL); - if (!dst->fsdefault) - return -ENOMEM; - } - if (src->fsfloor) { - dst->fsfloor = kstrdup(src->fsfloor, GFP_KERNEL); - if (!dst->fsfloor) - return -ENOMEM; - } - if (src->fshat) { - dst->fshat = kstrdup(src->fshat, GFP_KERNEL); - if (!dst->fshat) - return -ENOMEM; - } - if (src->fsroot) { - dst->fsroot = kstrdup(src->fsroot, GFP_KERNEL); - if (!dst->fsroot) - return -ENOMEM; - } - if (src->fstransmute) { - dst->fstransmute = kstrdup(src->fstransmute, GFP_KERNEL); - if (!dst->fstransmute) - return -ENOMEM; - } return 0; } @@ -712,8 +696,8 @@ static int smack_sb_eat_lsm_opts(char *options, void **mnt_opts) if (token != Opt_error) { arg = kmemdup_nul(arg, from + len - arg, GFP_KERNEL); rc = smack_add_opt(token, arg, mnt_opts); + kfree(arg); if (unlikely(rc)) { - kfree(arg); if (*mnt_opts) smack_free_mnt_opts(*mnt_opts); *mnt_opts = NULL; @@ -1477,7 +1461,7 @@ static int smack_inode_getsecurity(struct mnt_idmap *idmap, struct socket_smack *ssp; struct socket *sock; struct super_block *sbp; - struct inode *ip = (struct inode *)inode; + struct inode *ip = inode; struct smack_known *isp; if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) @@ -4847,7 +4831,7 @@ static int smack_uring_cmd(struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */ -struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = { +struct lsm_blob_sizes smack_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct task_smack), .lbs_file = sizeof(struct smack_known *), .lbs_inode = sizeof(struct inode_smack), @@ -4856,7 +4840,7 @@ struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = { .lbs_superblock = sizeof(struct superblock_smack), }; -static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { +static struct security_hook_list smack_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme), LSM_HOOK_INIT(syslog, smack_syslog), diff --git a/security/tomoyo/audit.c b/security/tomoyo/audit.c index 7cf8fdbb29bf..610c1536cf70 100644 --- a/security/tomoyo/audit.c +++ b/security/tomoyo/audit.c @@ -271,7 +271,7 @@ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, /* +18 is for " symlink.target=\"%s\"" */ len += 18 + strlen(symlink); } - len = tomoyo_round2(len); + len = kmalloc_size_roundup(len); buf = kzalloc(len, GFP_NOFS); if (!buf) goto out; @@ -382,12 +382,12 @@ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, goto out; } entry->log = buf; - len = tomoyo_round2(strlen(buf) + 1); + len = kmalloc_size_roundup(strlen(buf) + 1); /* * The entry->size is used for memory quota checks. * Don't go beyond strlen(entry->log). */ - entry->size = len + tomoyo_round2(sizeof(*entry)); + entry->size = len + kmalloc_size_roundup(sizeof(*entry)); spin_lock(&tomoyo_log_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] && tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size >= diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index f4cd9b58b205..969d4aa6fd55 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2094,7 +2094,7 @@ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) tomoyo_add_entry(r->domain, entry.query); goto out; } - len = tomoyo_round2(entry.query_len); + len = kmalloc_size_roundup(entry.query_len); entry.domain = r->domain; spin_lock(&tomoyo_query_list_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_QUERY] && diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h index ca285f362705..a539b2cbb5c4 100644 --- a/security/tomoyo/common.h +++ b/security/tomoyo/common.h @@ -1276,50 +1276,6 @@ static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void) return tomoyo_domain()->ns; } -#if defined(CONFIG_SLOB) - -/** - * tomoyo_round2 - Round up to power of 2 for calculating memory usage. - * - * @size: Size to be rounded up. - * - * Returns @size. - * - * Since SLOB does not round up, this function simply returns @size. - */ -static inline int tomoyo_round2(size_t size) -{ - return size; -} - -#else - -/** - * tomoyo_round2 - Round up to power of 2 for calculating memory usage. - * - * @size: Size to be rounded up. - * - * Returns rounded size. - * - * Strictly speaking, SLAB may be able to allocate (e.g.) 96 bytes instead of - * (e.g.) 128 bytes. - */ -static inline int tomoyo_round2(size_t size) -{ -#if PAGE_SIZE == 4096 - size_t bsize = 32; -#else - size_t bsize = 64; -#endif - if (!size) - return 0; - while (size > bsize) - bsize <<= 1; - return bsize; -} - -#endif - /** * list_for_each_cookie - iterate over a list with cookie. * @pos: the &struct list_head to use as a loop cursor. diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index af04a7b7eb28..25006fddc964 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -499,7 +499,7 @@ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg, return tomoyo_socket_sendmsg_permission(sock, msg, size); } -struct lsm_blob_sizes tomoyo_blob_sizes __lsm_ro_after_init = { +struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { .lbs_task = sizeof(struct tomoyo_task), }; @@ -546,7 +546,7 @@ static void tomoyo_task_free(struct task_struct *task) * tomoyo_security_ops is a "struct security_operations" which is used for * registering TOMOYO. */ -static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = { +static struct security_hook_list tomoyo_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds), LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc), @@ -583,7 +583,7 @@ static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = { /* Lock for GC. */ DEFINE_SRCU(tomoyo_ss); -int tomoyo_enabled __lsm_ro_after_init = 1; +int tomoyo_enabled __ro_after_init = 1; /** * tomoyo_init - Register TOMOYO Linux as a LSM module. diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 06e226166aab..478be269571a 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -421,7 +421,7 @@ static int yama_ptrace_traceme(struct task_struct *parent) return rc; } -static struct security_hook_list yama_hooks[] __lsm_ro_after_init = { +static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), LSM_HOOK_INIT(task_prctl, yama_task_prctl), diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 331380c2438b..5868661d461b 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3521,6 +3521,7 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; + const struct iovec *iov = iter_iov(to); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; @@ -3530,18 +3531,20 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; - if (!iter_is_iovec(to)) + if (!to->user_backed) return -EINVAL; if (to->nr_segs > 1024 || to->nr_segs != runtime->channels) return -EINVAL; - if (!frame_aligned(runtime, to->iov->iov_len)) + if (!frame_aligned(runtime, iov->iov_len)) return -EINVAL; - frames = bytes_to_samples(runtime, to->iov->iov_len); + frames = bytes_to_samples(runtime, iov->iov_len); bufs = kmalloc_array(to->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; - for (i = 0; i < to->nr_segs; ++i) - bufs[i] = to->iov[i].iov_base; + for (i = 0; i < to->nr_segs; ++i) { + bufs[i] = iov->iov_base; + iov++; + } result = snd_pcm_lib_readv(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); @@ -3558,6 +3561,7 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; + const struct iovec *iov = iter_iov(from); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; @@ -3567,17 +3571,19 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; - if (!iter_is_iovec(from)) + if (!from->user_backed) return -EINVAL; if (from->nr_segs > 128 || from->nr_segs != runtime->channels || - !frame_aligned(runtime, from->iov->iov_len)) + !frame_aligned(runtime, iov->iov_len)) return -EINVAL; - frames = bytes_to_samples(runtime, from->iov->iov_len); + frames = bytes_to_samples(runtime, iov->iov_len); bufs = kmalloc_array(from->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; - for (i = 0; i < from->nr_segs; ++i) - bufs[i] = from->iov[i].iov_base; + for (i = 0; i < from->nr_segs; ++i) { + bufs[i] = iov->iov_base; + iov++; + } result = snd_pcm_lib_writev(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); diff --git a/sound/firewire/tascam/tascam-stream.c b/sound/firewire/tascam/tascam-stream.c index 53e094cc411f..dfe783d01d7d 100644 --- a/sound/firewire/tascam/tascam-stream.c +++ b/sound/firewire/tascam/tascam-stream.c @@ -490,7 +490,7 @@ int snd_tscm_stream_start_duplex(struct snd_tscm *tscm, unsigned int rate) // packet is important for media clock recovery. err = amdtp_domain_start(&tscm->domain, tx_init_skip_cycles, true, true); if (err < 0) - return err; + goto error; if (!amdtp_domain_wait_ready(&tscm->domain, READY_TIMEOUT_MS)) { err = -ETIMEDOUT; diff --git a/sound/i2c/cs8427.c b/sound/i2c/cs8427.c index 65012af6a36e..f58b14b49045 100644 --- a/sound/i2c/cs8427.c +++ b/sound/i2c/cs8427.c @@ -561,10 +561,13 @@ int snd_cs8427_iec958_active(struct snd_i2c_device *cs8427, int active) if (snd_BUG_ON(!cs8427)) return -ENXIO; chip = cs8427->private_data; - if (active) + if (active) { memcpy(chip->playback.pcm_status, chip->playback.def_status, 24); - chip->playback.pcm_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; + chip->playback.pcm_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; + } else { + chip->playback.pcm_ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; + } snd_ctl_notify(cs8427->bus->card, SNDRV_CTL_EVENT_MASK_VALUE | SNDRV_CTL_EVENT_MASK_INFO, &chip->playback.pcm_ctl->id); diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c index 48af77ae8020..6ec394fb1846 100644 --- a/sound/pci/emu10k1/emupcm.c +++ b/sound/pci/emu10k1/emupcm.c @@ -1236,7 +1236,7 @@ static int snd_emu10k1_capture_mic_close(struct snd_pcm_substream *substream) { struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream); - emu->capture_interrupt = NULL; + emu->capture_mic_interrupt = NULL; emu->pcm_capture_mic_substream = NULL; return 0; } @@ -1344,7 +1344,7 @@ static int snd_emu10k1_capture_efx_close(struct snd_pcm_substream *substream) { struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream); - emu->capture_interrupt = NULL; + emu->capture_efx_interrupt = NULL; emu->pcm_capture_efx_substream = NULL; return 0; } @@ -1781,17 +1781,21 @@ int snd_emu10k1_pcm_efx(struct snd_emu10k1 *emu, int device) struct snd_kcontrol *kctl; int err; - err = snd_pcm_new(emu->card, "emu10k1 efx", device, 8, 1, &pcm); + err = snd_pcm_new(emu->card, "emu10k1 efx", device, emu->audigy ? 0 : 8, 1, &pcm); if (err < 0) return err; pcm->private_data = emu; - snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &snd_emu10k1_fx8010_playback_ops); + if (!emu->audigy) + snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &snd_emu10k1_fx8010_playback_ops); snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &snd_emu10k1_capture_efx_ops); pcm->info_flags = 0; - strcpy(pcm->name, "Multichannel Capture/PT Playback"); + if (emu->audigy) + strcpy(pcm->name, "Multichannel Capture"); + else + strcpy(pcm->name, "Multichannel Capture/PT Playback"); emu->pcm_efx = pcm; /* EFX capture - record the "FXBUS2" channels, by default we connect the EXTINs diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 4ffa3a59f419..5c6980394dce 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -4604,7 +4604,7 @@ HDA_CODEC_ENTRY(0x80862814, "DG1 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862815, "Alderlake HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862816, "Rocketlake HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862818, "Raptorlake HDMI", patch_i915_tgl_hdmi), -HDA_CODEC_ENTRY(0x80862819, "DG2 HDMI", patch_i915_adlp_hdmi), +HDA_CODEC_ENTRY(0x80862819, "DG2 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x8086281a, "Jasperlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x8086281b, "Elkhartlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x8086281c, "Alderlake-P HDMI", patch_i915_adlp_hdmi), diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 26187f5d56b5..f70d6a33421d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6960,6 +6960,8 @@ enum { ALC269_FIXUP_DELL_M101Z, ALC269_FIXUP_SKU_IGNORE, ALC269_FIXUP_ASUS_G73JW, + ALC269_FIXUP_ASUS_N7601ZM_PINS, + ALC269_FIXUP_ASUS_N7601ZM, ALC269_FIXUP_LENOVO_EAPD, ALC275_FIXUP_SONY_HWEQ, ALC275_FIXUP_SONY_DISABLE_AAMIX, @@ -7256,6 +7258,29 @@ static const struct hda_fixup alc269_fixups[] = { { } } }, + [ALC269_FIXUP_ASUS_N7601ZM_PINS] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03A11050 }, + { 0x1a, 0x03A11C30 }, + { 0x21, 0x03211420 }, + { } + } + }, + [ALC269_FIXUP_ASUS_N7601ZM] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + {0x20, AC_VERB_SET_COEF_INDEX, 0x62}, + {0x20, AC_VERB_SET_PROC_COEF, 0xa007}, + {0x20, AC_VERB_SET_COEF_INDEX, 0x10}, + {0x20, AC_VERB_SET_PROC_COEF, 0x8420}, + {0x20, AC_VERB_SET_COEF_INDEX, 0x0f}, + {0x20, AC_VERB_SET_PROC_COEF, 0x7774}, + { } + }, + .chained = true, + .chain_id = ALC269_FIXUP_ASUS_N7601ZM_PINS, + }, [ALC269_FIXUP_LENOVO_EAPD] = { .type = HDA_FIXUP_VERBS, .v.verbs = (const struct hda_verb[]) { @@ -9263,7 +9288,6 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0a62, "Dell Precision 5560", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x0a9d, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0a9e, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0x1028, 0x0ac9, "Dell Precision 3260", ALC283_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x1028, 0x0b19, "Dell XPS 15 9520", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x0b1a, "Dell Precision 5570", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x0b37, "Dell Inspiron 16 Plus 7620 2-in-1", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), @@ -9444,6 +9468,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8b47, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8b65, "HP ProBook 455 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b66, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b7a, "HP", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b7d, "HP", ALC236_FIXUP_HP_GPIO_LED), @@ -9466,6 +9491,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1271, "ASUS X430UN", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1290, "ASUS X441SA", ALC233_FIXUP_EAPD_COEF_AND_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x12a0, "ASUS X441UV", ALC233_FIXUP_EAPD_COEF_AND_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x12a3, "Asus N7691ZM", ALC269_FIXUP_ASUS_N7601ZM), SND_PCI_QUIRK(0x1043, 0x12af, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x12e0, "ASUS X541SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x12f0, "ASUS X541UV", ALC256_FIXUP_ASUS_MIC), @@ -9663,6 +9689,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x22f1, "Thinkpad", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x22f2, "Thinkpad", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x22f3, "Thinkpad", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x2318, "Thinkpad Z13 Gen2", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x2319, "Thinkpad Z16 Gen2", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x231a, "Thinkpad Z16 Gen2", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION), diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index a794a01a68ca..61258b0aac8d 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1707,6 +1707,7 @@ static const struct snd_pci_quirk stac925x_fixup_tbl[] = { }; static const struct hda_pintbl ref92hd73xx_pin_configs[] = { + // Port A-H { 0x0a, 0x02214030 }, { 0x0b, 0x02a19040 }, { 0x0c, 0x01a19020 }, @@ -1715,9 +1716,12 @@ static const struct hda_pintbl ref92hd73xx_pin_configs[] = { { 0x0f, 0x01014010 }, { 0x10, 0x01014020 }, { 0x11, 0x01014030 }, + // CD in { 0x12, 0x02319040 }, + // Digial Mic ins { 0x13, 0x90a000f0 }, { 0x14, 0x90a000f0 }, + // Digital outs { 0x22, 0x01452050 }, { 0x23, 0x01452050 }, {} @@ -1758,6 +1762,7 @@ static const struct hda_pintbl alienware_m17x_pin_configs[] = { }; static const struct hda_pintbl intel_dg45id_pin_configs[] = { + // Analog outputs { 0x0a, 0x02214230 }, { 0x0b, 0x02A19240 }, { 0x0c, 0x01013214 }, @@ -1765,6 +1770,9 @@ static const struct hda_pintbl intel_dg45id_pin_configs[] = { { 0x0e, 0x01A19250 }, { 0x0f, 0x01011212 }, { 0x10, 0x01016211 }, + // Digital output + { 0x22, 0x01451380 }, + { 0x23, 0x40f000f0 }, {} }; @@ -1955,6 +1963,8 @@ static const struct snd_pci_quirk stac92hd73xx_fixup_tbl[] = { "DFI LanParty", STAC_92HD73XX_REF), SND_PCI_QUIRK(PCI_VENDOR_ID_DFI, 0x3101, "DFI LanParty", STAC_92HD73XX_REF), + SND_PCI_QUIRK(PCI_VENDOR_ID_INTEL, 0x5001, + "Intel DP45SG", STAC_92HD73XX_INTEL), SND_PCI_QUIRK(PCI_VENDOR_ID_INTEL, 0x5002, "Intel DG45ID", STAC_92HD73XX_INTEL), SND_PCI_QUIRK(PCI_VENDOR_ID_INTEL, 0x5003, diff --git a/sound/soc/codecs/max98373.c b/sound/soc/codecs/max98373.c index f90a6a7ba83b..fde055c6c894 100644 --- a/sound/soc/codecs/max98373.c +++ b/sound/soc/codecs/max98373.c @@ -31,7 +31,7 @@ static int max98373_dac_event(struct snd_soc_dapm_widget *w, MAX98373_GLOBAL_EN_MASK, 1); usleep_range(30000, 31000); break; - case SND_SOC_DAPM_POST_PMD: + case SND_SOC_DAPM_PRE_PMD: regmap_update_bits(max98373->regmap, MAX98373_R20FF_GLOBAL_SHDN, MAX98373_GLOBAL_EN_MASK, 0); @@ -64,7 +64,7 @@ static const struct snd_kcontrol_new max98373_spkfb_control = static const struct snd_soc_dapm_widget max98373_dapm_widgets[] = { SND_SOC_DAPM_DAC_E("Amp Enable", "HiFi Playback", MAX98373_R202B_PCM_RX_EN, 0, 0, max98373_dac_event, - SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), SND_SOC_DAPM_MUX("DAI Sel Mux", SND_SOC_NOPM, 0, 0, &max98373_dai_controls), SND_SOC_DAPM_OUTPUT("BE_OUT"), diff --git a/sound/soc/fsl/fsl_asrc_dma.c b/sound/soc/fsl/fsl_asrc_dma.c index 3b81a465814a..05a7d1588d20 100644 --- a/sound/soc/fsl/fsl_asrc_dma.c +++ b/sound/soc/fsl/fsl_asrc_dma.c @@ -209,14 +209,19 @@ static int fsl_asrc_dma_hw_params(struct snd_soc_component *component, be_chan = soc_component_to_pcm(component_be)->chan[substream->stream]; tmp_chan = be_chan; } - if (!tmp_chan) - tmp_chan = dma_request_slave_channel(dev_be, tx ? "tx" : "rx"); + if (!tmp_chan) { + tmp_chan = dma_request_chan(dev_be, tx ? "tx" : "rx"); + if (IS_ERR(tmp_chan)) { + dev_err(dev, "failed to request DMA channel for Back-End\n"); + return -EINVAL; + } + } /* * An EDMA DEV_TO_DEV channel is fixed and bound with DMA event of each * peripheral, unlike SDMA channel that is allocated dynamically. So no * need to configure dma_request and dma_request2, but get dma_chan of - * Back-End device directly via dma_request_slave_channel. + * Back-End device directly via dma_request_chan. */ if (!asrc->use_edma) { /* Get DMA request of Back-End */ diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index 1b197478b3d9..990bba0be1fb 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -1546,7 +1546,7 @@ static const struct fsl_sai_soc_data fsl_sai_imx8qm_data = { .use_imx_pcm = true, .use_edma = true, .fifo_depth = 64, - .pins = 1, + .pins = 4, .reg_offset = 0, .mclk0_is_mclk1 = false, .flags = 0, diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 669b99a4f76e..3a5394c3dd83 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -1806,10 +1806,12 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route * int ret; if (!src_fw_module || !sink_fw_module) { - /* The NULL module will print as "(efault)" */ - dev_err(sdev->dev, "source %s or sink %s widget weren't set up properly\n", - src_fw_module->man4_module_entry.name, - sink_fw_module->man4_module_entry.name); + dev_err(sdev->dev, + "cannot bind %s -> %s, no firmware module for: %s%s\n", + src_widget->widget->name, sink_widget->widget->name, + src_fw_module ? "" : " source", + sink_fw_module ? "" : " sink"); + return -ENODEV; } diff --git a/sound/soc/sof/pm.c b/sound/soc/sof/pm.c index 8d3383085d12..85412aeb1ca1 100644 --- a/sound/soc/sof/pm.c +++ b/sound/soc/sof/pm.c @@ -183,6 +183,7 @@ static int sof_suspend(struct device *dev, bool runtime_suspend) const struct sof_ipc_tplg_ops *tplg_ops = sof_ipc_get_ops(sdev, tplg); pm_message_t pm_state; u32 target_state = snd_sof_dsp_power_target(sdev); + u32 old_state = sdev->dsp_power_state.state; int ret; /* do nothing if dsp suspend callback is not set */ @@ -192,7 +193,12 @@ static int sof_suspend(struct device *dev, bool runtime_suspend) if (runtime_suspend && !sof_ops(sdev)->runtime_suspend) return 0; - if (tplg_ops && tplg_ops->tear_down_all_pipelines) + /* we need to tear down pipelines only if the DSP hardware is + * active, which happens for PCI devices. if the device is + * suspended, it is brought back to full power and then + * suspended again + */ + if (tplg_ops && tplg_ops->tear_down_all_pipelines && (old_state == SOF_DSP_PM_D0)) tplg_ops->tear_down_all_pipelines(sdev, false); if (sdev->fw_state != SOF_FW_BOOT_COMPLETE) diff --git a/tools/Makefile b/tools/Makefile index e497875fc7e3..37e9f6804832 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -39,7 +39,7 @@ help: @echo ' turbostat - Intel CPU idle stats and freq reporting tool' @echo ' usb - USB testing tools' @echo ' virtio - vhost test module' - @echo ' vm - misc vm tools' + @echo ' mm - misc mm tools' @echo ' wmi - WMI interface examples' @echo ' x86_energy_perf_policy - Intel energy policy tool' @echo '' @@ -69,7 +69,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -cgroup counter firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE +cgroup counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE $(call descend,$@) bpf/%: FORCE @@ -118,7 +118,7 @@ kvm_stat: FORCE all: acpi cgroup counter cpupower gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ - virtio vm bpf x86_energy_perf_policy \ + virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ pci debugging tracing thermal thermometer thermal-engine @@ -128,7 +128,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: +cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: $(call descend,$(@:_install=),install) selftests_install: @@ -158,7 +158,7 @@ kvm_stat_install: install: acpi_install cgroup_install counter_install cpupower_install gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ - virtio_install vm_install bpf_install x86_energy_perf_policy_install \ + virtio_install mm_install bpf_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install \ wmi_install pci_install debugging_install intel-speed-select_install \ tracing_install thermometer_install thermal-engine_install @@ -169,7 +169,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: +cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: $(call descend,$(@:_clean=),clean) libapi_clean: @@ -211,7 +211,7 @@ build_clean: clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ - vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ + mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean diff --git a/tools/arch/loongarch/include/uapi/asm/bitsperlong.h b/tools/arch/loongarch/include/uapi/asm/bitsperlong.h index d4e32b3d4843..00b4ba1e5cdf 100644 --- a/tools/arch/loongarch/include/uapi/asm/bitsperlong.h +++ b/tools/arch/loongarch/include/uapi/asm/bitsperlong.h @@ -2,7 +2,7 @@ #ifndef __ASM_LOONGARCH_BITSPERLONG_H #define __ASM_LOONGARCH_BITSPERLONG_H -#define __BITS_PER_LONG (__SIZEOF_POINTER__ * 8) +#define __BITS_PER_LONG (__SIZEOF_LONG__ * 8) #include <asm-generic/bitsperlong.h> diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv index 4f1c4b0c29e9..e0c25b75327e 100644 --- a/tools/arch/x86/kcpuid/cpuid.csv +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -184,8 +184,8 @@ 7, 0, EBX, 27, avx512er, AVX512 Exponent Reciproca instr 7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr 7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr - 7, 0, EBX, 26, avx512bw, AVX512 Byte & Word instr - 7, 0, EBX, 28, avx512vl, AVX512 Vector Length Extentions (VL) + 7, 0, EBX, 30, avx512bw, AVX512 Byte & Word instr + 7, 0, EBX, 31, avx512vl, AVX512 Vector Length Extentions (VL) 7, 0, ECX, 0, prefetchwt1, X 7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions 7, 0, ECX, 2, umip, User-mode Instruction Prevention @@ -340,19 +340,70 @@ # According to SDM # 40000000H - 4FFFFFFFH is invalid range - # Leaf 80000001H # Extended Processor Signature and Feature Bits +0x80000001, 0, EAX, 27:20, extfamily, Extended family +0x80000001, 0, EAX, 19:16, extmodel, Extended model +0x80000001, 0, EAX, 11:8, basefamily, Description of Family +0x80000001, 0, EAX, 11:8, basemodel, Model numbers vary with product +0x80000001, 0, EAX, 3:0, stepping, Processor stepping (revision) for a specific model + +0x80000001, 0, EBX, 31:28, pkgtype, Specifies the package type + 0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode +0x80000001, 0, ECX, 1, cmplegacy, Core multi-processing legacy mode +0x80000001, 0, ECX, 2, svm, Indicates support for: VMRUN, VMLOAD, VMSAVE, CLGI, VMMCALL, and INVLPGA +0x80000001, 0, ECX, 3, extapicspace, Extended APIC register space +0x80000001, 0, ECX, 4, altmovecr8, Indicates support for LOCK MOV CR0 means MOV CR8 0x80000001, 0, ECX, 5, lzcnt, LZCNT +0x80000001, 0, ECX, 6, sse4a, EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support +0x80000001, 0, ECX, 7, misalignsse, Misaligned SSE Mode 0x80000001, 0, ECX, 8, prefetchw, PREFETCHW - +0x80000001, 0, ECX, 9, osvw, OS Visible Work-around support +0x80000001, 0, ECX, 10, ibs, Instruction Based Sampling +0x80000001, 0, ECX, 11, xop, Extended operation support +0x80000001, 0, ECX, 12, skinit, SKINIT and STGI support +0x80000001, 0, ECX, 13, wdt, Watchdog timer support +0x80000001, 0, ECX, 15, lwp, Lightweight profiling support +0x80000001, 0, ECX, 16, fma4, Four-operand FMA instruction support +0x80000001, 0, ECX, 17, tce, Translation cache extension +0x80000001, 0, ECX, 22, TopologyExtensions, Indicates support for Core::X86::Cpuid::CachePropEax0 and Core::X86::Cpuid::ExtApicId +0x80000001, 0, ECX, 23, perfctrextcore, Indicates support for Core::X86::Msr::PERF_CTL0 - 5 and Core::X86::Msr::PERF_CTR +0x80000001, 0, ECX, 24, perfctrextdf, Indicates support for Core::X86::Msr::DF_PERF_CTL and Core::X86::Msr::DF_PERF_CTR +0x80000001, 0, ECX, 26, databreakpointextension, Indicates data breakpoint support for Core::X86::Msr::DR0_ADDR_MASK, Core::X86::Msr::DR1_ADDR_MASK, Core::X86::Msr::DR2_ADDR_MASK and Core::X86::Msr::DR3_ADDR_MASK +0x80000001, 0, ECX, 27, perftsc, Performance time-stamp counter supported +0x80000001, 0, ECX, 28, perfctrextllc, Indicates support for L3 performance counter extensions +0x80000001, 0, ECX, 29, mwaitextended, MWAITX and MONITORX capability is supported +0x80000001, 0, ECX, 30, admskextn, Indicates support for address mask extension (to 32 bits and to all 4 DRs) for instruction breakpoints + +0x80000001, 0, EDX, 0, fpu, x87 floating point unit on-chip +0x80000001, 0, EDX, 1, vme, Virtual-mode enhancements +0x80000001, 0, EDX, 2, de, Debugging extensions, IO breakpoints, CR4.DE +0x80000001, 0, EDX, 3, pse, Page-size extensions (4 MB pages) +0x80000001, 0, EDX, 4, tsc, Time stamp counter, RDTSC/RDTSCP instructions, CR4.TSD +0x80000001, 0, EDX, 5, msr, Model-specific registers (MSRs), with RDMSR and WRMSR instructions +0x80000001, 0, EDX, 6, pae, Physical-address extensions (PAE) +0x80000001, 0, EDX, 7, mce, Machine Check Exception, CR4.MCE +0x80000001, 0, EDX, 8, cmpxchg8b, CMPXCHG8B instruction +0x80000001, 0, EDX, 9, apic, advanced programmable interrupt controller (APIC) exists and is enabled 0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported +0x80000001, 0, EDX, 12, mtrr, Memory-type range registers +0x80000001, 0, EDX, 13, pge, Page global extension, CR4.PGE +0x80000001, 0, EDX, 14, mca, Machine check architecture, MCG_CAP +0x80000001, 0, EDX, 15, cmov, Conditional move instructions, CMOV, FCOMI, FCMOV +0x80000001, 0, EDX, 16, pat, Page attribute table +0x80000001, 0, EDX, 17, pse36, Page-size extensions 0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available +0x80000001, 0, EDX, 22, mmxext, AMD extensions to MMX instructions +0x80000001, 0, EDX, 23, mmx, MMX instructions +0x80000001, 0, EDX, 24, fxsr, FXSAVE and FXRSTOR instructions +0x80000001, 0, EDX, 25, ffxsr, FXSAVE and FXRSTOR instruction optimizations 0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported 0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available -#0x80000001, 0, EDX, 29, 64b, 64b Architecture supported +0x80000001, 0, EDX, 29, lm, 64b Architecture supported +0x80000001, 0, EDX, 30, threednowext, AMD extensions to 3DNow! instructions +0x80000001, 0, EDX, 31, threednow, 3DNow! instructions # Leaf 80000002H/80000003H/80000004H # Processor Brand String diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index dae75511fef7..416f5b35dd8f 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -33,7 +33,7 @@ struct reg_desc { struct bits_desc descs[32]; }; -enum { +enum cpuid_reg { R_EAX = 0, R_EBX, R_ECX, @@ -41,6 +41,10 @@ enum { NR_REGS }; +static const char * const reg_names[] = { + "EAX", "EBX", "ECX", "EDX", +}; + struct subleaf { u32 index; u32 sub; @@ -428,12 +432,18 @@ static void parse_text(void) /* Decode every eax/ebx/ecx/edx */ -static void decode_bits(u32 value, struct reg_desc *rdesc) +static void decode_bits(u32 value, struct reg_desc *rdesc, enum cpuid_reg reg) { struct bits_desc *bdesc; int start, end, i; u32 mask; + if (!rdesc->nr) { + if (show_details) + printf("\t %s: 0x%08x\n", reg_names[reg], value); + return; + } + for (i = 0; i < rdesc->nr; i++) { bdesc = &rdesc->descs[i]; @@ -468,13 +478,21 @@ static void show_leaf(struct subleaf *leaf) if (!leaf) return; - if (show_raw) + if (show_raw) { leaf_print_raw(leaf); + } else { + if (show_details) + printf("CPUID_0x%x_ECX[0x%x]:\n", + leaf->index, leaf->sub); + } + + decode_bits(leaf->eax, &leaf->info[R_EAX], R_EAX); + decode_bits(leaf->ebx, &leaf->info[R_EBX], R_EBX); + decode_bits(leaf->ecx, &leaf->info[R_ECX], R_ECX); + decode_bits(leaf->edx, &leaf->info[R_EDX], R_EDX); - decode_bits(leaf->eax, &leaf->info[R_EAX]); - decode_bits(leaf->ebx, &leaf->info[R_EBX]); - decode_bits(leaf->ecx, &leaf->info[R_ECX]); - decode_bits(leaf->edx, &leaf->info[R_EDX]); + if (!show_raw && show_details) + printf("\n"); } static void show_func(struct cpuid_func *func) diff --git a/tools/include/linux/err.h b/tools/include/linux/err.h index 25f2bb3a991d..332b983ead1e 100644 --- a/tools/include/linux/err.h +++ b/tools/include/linux/err.h @@ -20,7 +20,7 @@ * Userspace note: * The same principle works for userspace, because 'error' pointers * fall down to the unused hole far from user space, as described - * in Documentation/x86/x86_64/mm.rst for x86_64 arch: + * in Documentation/arch/x86/x86_64/mm.rst for x86_64 arch: * * 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension * ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole diff --git a/tools/include/nolibc/.gitignore b/tools/include/nolibc/.gitignore new file mode 100644 index 000000000000..dea22eaaed2b --- /dev/null +++ b/tools/include/nolibc/.gitignore @@ -0,0 +1 @@ +sysroot diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index cfd06764b5ae..9839feafd38a 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -25,8 +25,8 @@ endif nolibc_arch := $(patsubst arm64,aarch64,$(ARCH)) arch_file := arch-$(nolibc_arch).h -all_files := ctype.h errno.h nolibc.h signal.h std.h stdio.h stdlib.h string.h \ - sys.h time.h types.h unistd.h +all_files := ctype.h errno.h nolibc.h signal.h stackprotector.h std.h stdint.h \ + stdio.h stdlib.h string.h sys.h time.h types.h unistd.h # install all headers needed to support a bare-metal compiler all: headers diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index e8d0cf545bf1..2d98d78fd3f3 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -181,6 +181,8 @@ struct sys_stat_struct { char **environ __attribute__((weak)); const unsigned long *_auxv __attribute__((weak)); +#define __ARCH_SUPPORTS_STACK_PROTECTOR + /* startup code */ /* * i386 System V ABI mandates: @@ -188,9 +190,12 @@ const unsigned long *_auxv __attribute__((weak)); * 2) The deepest stack frame should be set to zero * */ -void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void) +void __attribute__((weak,noreturn,optimize("omit-frame-pointer"),no_stack_protector)) _start(void) { __asm__ volatile ( +#ifdef NOLIBC_STACKPROTECTOR + "call __stack_chk_init\n" // initialize stack protector +#endif "pop %eax\n" // argc (first arg, %eax) "mov %esp, %ebx\n" // argv[] (second arg, %ebx) "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) diff --git a/tools/include/nolibc/arch-loongarch.h b/tools/include/nolibc/arch-loongarch.h new file mode 100644 index 000000000000..029ee3cd6baf --- /dev/null +++ b/tools/include/nolibc/arch-loongarch.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * LoongArch specific definitions for NOLIBC + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#ifndef _NOLIBC_ARCH_LOONGARCH_H +#define _NOLIBC_ARCH_LOONGARCH_H + +/* Syscalls for LoongArch : + * - stack is 16-byte aligned + * - syscall number is passed in a7 + * - arguments are in a0, a1, a2, a3, a4, a5 + * - the system call is performed by calling "syscall 0" + * - syscall return comes in a0 + * - the arguments are cast to long and assigned into the target + * registers which are then simply passed as registers to the asm code, + * so that we don't have to experience issues with register constraints. + * + * On LoongArch, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0"); \ + \ + __asm__ volatile ( \ + "syscall 0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "$t0", "$t1", "$t2", "$t3", \ + "$t4", "$t5", "$t6", "$t7", "$t8" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + \ + __asm__ volatile ( \ + "syscall 0\n" \ + : "+r"(_arg1) \ + : "r"(_num) \ + : "memory", "$t0", "$t1", "$t2", "$t3", \ + "$t4", "$t5", "$t6", "$t7", "$t8" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + \ + __asm__ volatile ( \ + "syscall 0\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), \ + "r"(_num) \ + : "memory", "$t0", "$t1", "$t2", "$t3", \ + "$t4", "$t5", "$t6", "$t7", "$t8" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + \ + __asm__ volatile ( \ + "syscall 0\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "$t0", "$t1", "$t2", "$t3", \ + "$t4", "$t5", "$t6", "$t7", "$t8" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + \ + __asm__ volatile ( \ + "syscall 0\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "$t0", "$t1", "$t2", "$t3", \ + "$t4", "$t5", "$t6", "$t7", "$t8" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + \ + __asm__ volatile ( \ + "syscall 0\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "$t0", "$t1", "$t2", "$t3", \ + "$t4", "$t5", "$t6", "$t7", "$t8" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + register long _arg6 __asm__ ("a5") = (long)(arg6); \ + \ + __asm__ volatile ( \ + "syscall 0\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), "r"(_arg6), \ + "r"(_num) \ + : "memory", "$t0", "$t1", "$t2", "$t3", \ + "$t4", "$t5", "$t6", "$t7", "$t8" \ + ); \ + _arg1; \ +}) + +char **environ __attribute__((weak)); +const unsigned long *_auxv __attribute__((weak)); + +#if __loongarch_grlen == 32 +#define LONGLOG "2" +#define SZREG "4" +#define REG_L "ld.w" +#define LONG_S "st.w" +#define LONG_ADD "add.w" +#define LONG_ADDI "addi.w" +#define LONG_SLL "slli.w" +#define LONG_BSTRINS "bstrins.w" +#else // __loongarch_grlen == 64 +#define LONGLOG "3" +#define SZREG "8" +#define REG_L "ld.d" +#define LONG_S "st.d" +#define LONG_ADD "add.d" +#define LONG_ADDI "addi.d" +#define LONG_SLL "slli.d" +#define LONG_BSTRINS "bstrins.d" +#endif + +/* startup code */ +void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void) +{ + __asm__ volatile ( + REG_L " $a0, $sp, 0\n" // argc (a0) was in the stack + LONG_ADDI " $a1, $sp, "SZREG"\n" // argv (a1) = sp + SZREG + LONG_SLL " $a2, $a0, "LONGLOG"\n" // envp (a2) = SZREG*argc ... + LONG_ADDI " $a2, $a2, "SZREG"\n" // + SZREG (skip null) + LONG_ADD " $a2, $a2, $a1\n" // + argv + + "move $a3, $a2\n" // iterate a3 over envp to find auxv (after NULL) + "0:\n" // do { + REG_L " $a4, $a3, 0\n" // a4 = *a3; + LONG_ADDI " $a3, $a3, "SZREG"\n" // a3 += sizeof(void*); + "bne $a4, $zero, 0b\n" // } while (a4); + "la.pcrel $a4, _auxv\n" // a4 = &_auxv + LONG_S " $a3, $a4, 0\n" // store a3 into _auxv + + "la.pcrel $a3, environ\n" // a3 = &environ + LONG_S " $a2, $a3, 0\n" // store envp(a2) into environ + LONG_BSTRINS " $sp, $zero, 3, 0\n" // sp must be 16-byte aligned + "bl main\n" // main() returns the status code, we'll exit with it. + "li.w $a7, 93\n" // NR_exit == 93 + "syscall 0\n" + ); + __builtin_unreachable(); +} + +#endif // _NOLIBC_ARCH_LOONGARCH_H diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 17f6751208e7..f7f2a11d4c3b 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -181,6 +181,8 @@ struct sys_stat_struct { char **environ __attribute__((weak)); const unsigned long *_auxv __attribute__((weak)); +#define __ARCH_SUPPORTS_STACK_PROTECTOR + /* startup code */ /* * x86-64 System V ABI mandates: @@ -191,6 +193,9 @@ const unsigned long *_auxv __attribute__((weak)); void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void) { __asm__ volatile ( +#ifdef NOLIBC_STACKPROTECTOR + "call __stack_chk_init\n" // initialize stack protector +#endif "pop %rdi\n" // argc (first arg, %rdi) "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index 78b067a4fa47..2d5386a8d6aa 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -29,6 +29,8 @@ #include "arch-riscv.h" #elif defined(__s390x__) #include "arch-s390.h" +#elif defined(__loongarch__) +#include "arch-loongarch.h" #endif #endif /* _NOLIBC_ARCH_H */ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index b2bc48d3cfe4..04739a6293c4 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -104,6 +104,7 @@ #include "string.h" #include "time.h" #include "unistd.h" +#include "stackprotector.h" /* Used by programs to avoid std includes */ #define NOLIBC diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h new file mode 100644 index 000000000000..d119cbbbc256 --- /dev/null +++ b/tools/include/nolibc/stackprotector.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Stack protector support for NOLIBC + * Copyright (C) 2023 Thomas Weißschuh <linux@weissschuh.net> + */ + +#ifndef _NOLIBC_STACKPROTECTOR_H +#define _NOLIBC_STACKPROTECTOR_H + +#include "arch.h" + +#if defined(NOLIBC_STACKPROTECTOR) + +#if !defined(__ARCH_SUPPORTS_STACK_PROTECTOR) +#error "nolibc does not support stack protectors on this arch" +#endif + +#include "sys.h" +#include "stdlib.h" + +/* The functions in this header are using raw syscall macros to avoid + * triggering stack protector errors themselves + */ + +__attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) +void __stack_chk_fail(void) +{ + pid_t pid; + my_syscall3(__NR_write, STDERR_FILENO, "!!Stack smashing detected!!\n", 28); + pid = my_syscall0(__NR_getpid); + my_syscall2(__NR_kill, pid, SIGABRT); + for (;;); +} + +__attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) +void __stack_chk_fail_local(void) +{ + __stack_chk_fail(); +} + +__attribute__((weak,section(".data.nolibc_stack_chk"))) +uintptr_t __stack_chk_guard; + +__attribute__((weak,no_stack_protector,section(".text.nolibc_stack_chk"))) +void __stack_chk_init(void) +{ + my_syscall3(__NR_getrandom, &__stack_chk_guard, sizeof(__stack_chk_guard), 0); + /* a bit more randomness in case getrandom() fails */ + __stack_chk_guard ^= (uintptr_t) &__stack_chk_guard; +} +#endif // defined(NOLIBC_STACKPROTECTOR) + +#endif // _NOLIBC_STACKPROTECTOR_H diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h index 1747ae125392..933bc0be7e1c 100644 --- a/tools/include/nolibc/std.h +++ b/tools/include/nolibc/std.h @@ -18,20 +18,7 @@ #define NULL ((void *)0) #endif -/* stdint types */ -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef unsigned short uint16_t; -typedef signed short int16_t; -typedef unsigned int uint32_t; -typedef signed int int32_t; -typedef unsigned long long uint64_t; -typedef signed long long int64_t; -typedef unsigned long size_t; -typedef signed long ssize_t; -typedef unsigned long uintptr_t; -typedef signed long intptr_t; -typedef signed long ptrdiff_t; +#include "stdint.h" /* those are commonly provided by sys/types.h */ typedef unsigned int dev_t; diff --git a/tools/include/nolibc/stdint.h b/tools/include/nolibc/stdint.h new file mode 100644 index 000000000000..c1ce4f5e0603 --- /dev/null +++ b/tools/include/nolibc/stdint.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Standard definitions and types for NOLIBC + * Copyright (C) 2023 Vincent Dagonneau <v@vda.io> + */ + +#ifndef _NOLIBC_STDINT_H +#define _NOLIBC_STDINT_H + +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +typedef unsigned long size_t; +typedef signed long ssize_t; +typedef unsigned long uintptr_t; +typedef signed long intptr_t; +typedef signed long ptrdiff_t; + +typedef int8_t int_least8_t; +typedef uint8_t uint_least8_t; +typedef int16_t int_least16_t; +typedef uint16_t uint_least16_t; +typedef int32_t int_least32_t; +typedef uint32_t uint_least32_t; +typedef int64_t int_least64_t; +typedef uint64_t uint_least64_t; + +typedef int8_t int_fast8_t; +typedef uint8_t uint_fast8_t; +typedef ssize_t int_fast16_t; +typedef size_t uint_fast16_t; +typedef ssize_t int_fast32_t; +typedef size_t uint_fast32_t; +typedef ssize_t int_fast64_t; +typedef size_t uint_fast64_t; + +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + +/* limits of integral types */ + +#define INT8_MIN (-128) +#define INT16_MIN (-32767-1) +#define INT32_MIN (-2147483647-1) +#define INT64_MIN (-9223372036854775807LL-1) + +#define INT8_MAX (127) +#define INT16_MAX (32767) +#define INT32_MAX (2147483647) +#define INT64_MAX (9223372036854775807LL) + +#define UINT8_MAX (255) +#define UINT16_MAX (65535) +#define UINT32_MAX (4294967295U) +#define UINT64_MAX (18446744073709551615ULL) + +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST64_MIN INT64_MIN + +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MAX INT64_MAX + +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +#define SIZE_MAX ((size_t)(__LONG_MAX__) * 2 + 1) +#define INTPTR_MIN (-__LONG_MAX__ - 1) +#define INTPTR_MAX __LONG_MAX__ +#define PTRDIFF_MIN INTPTR_MIN +#define PTRDIFF_MAX INTPTR_MAX +#define UINTPTR_MAX SIZE_MAX + +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST16_MIN INTPTR_MIN +#define INT_FAST32_MIN INTPTR_MIN +#define INT_FAST64_MIN INTPTR_MIN + +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MAX INTPTR_MAX +#define INT_FAST32_MAX INTPTR_MAX +#define INT_FAST64_MAX INTPTR_MAX + +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX SIZE_MAX +#define UINT_FAST32_MAX SIZE_MAX +#define UINT_FAST64_MAX SIZE_MAX + +#endif /* _NOLIBC_STDINT_H */ diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 96ac8afc5aee..6cbbb52836a0 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -273,6 +273,12 @@ int vfprintf(FILE *stream, const char *fmt, va_list args) return written; } +static __attribute__((unused)) +int vprintf(const char *fmt, va_list args) +{ + return vfprintf(stdout, fmt, args); +} + static __attribute__((unused, format(printf, 2, 3))) int fprintf(FILE *stream, const char *fmt, ...) { diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index b5f8cd35c03b..5d624dc63a42 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -11,7 +11,6 @@ #include "std.h" /* system includes */ -#include <asm/fcntl.h> // for O_* #include <asm/unistd.h> #include <asm/signal.h> // for SIGCHLD #include <asm/ioctls.h> @@ -20,6 +19,8 @@ #include <linux/loop.h> #include <linux/time.h> #include <linux/auxvec.h> +#include <linux/fcntl.h> // for O_* and AT_* +#include <linux/stat.h> // for statx() #include "arch.h" #include "errno.h" @@ -411,6 +412,27 @@ int getdents64(int fd, struct linux_dirent64 *dirp, int count) /* + * uid_t geteuid(void); + */ + +static __attribute__((unused)) +uid_t sys_geteuid(void) +{ +#ifdef __NR_geteuid32 + return my_syscall0(__NR_geteuid32); +#else + return my_syscall0(__NR_geteuid); +#endif +} + +static __attribute__((unused)) +uid_t geteuid(void) +{ + return sys_geteuid(); +} + + +/* * pid_t getpgid(pid_t pid); */ @@ -545,6 +567,27 @@ int gettimeofday(struct timeval *tv, struct timezone *tz) /* + * uid_t getuid(void); + */ + +static __attribute__((unused)) +uid_t sys_getuid(void) +{ +#ifdef __NR_getuid32 + return my_syscall0(__NR_getuid32); +#else + return my_syscall0(__NR_getuid); +#endif +} + +static __attribute__((unused)) +uid_t getuid(void) +{ + return sys_getuid(); +} + + +/* * int ioctl(int fd, unsigned long req, void *value); */ @@ -1048,12 +1091,66 @@ pid_t setsid(void) return ret; } +#if defined(__NR_statx) +/* + * int statx(int fd, const char *path, int flags, unsigned int mask, struct statx *buf); + */ + +static __attribute__((unused)) +int sys_statx(int fd, const char *path, int flags, unsigned int mask, struct statx *buf) +{ + return my_syscall5(__NR_statx, fd, path, flags, mask, buf); +} + +static __attribute__((unused)) +int statx(int fd, const char *path, int flags, unsigned int mask, struct statx *buf) +{ + int ret = sys_statx(fd, path, flags, mask, buf); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} +#endif /* * int stat(const char *path, struct stat *buf); * Warning: the struct stat's layout is arch-dependent. */ +#if defined(__NR_statx) && !defined(__NR_newfstatat) && !defined(__NR_stat) +/* + * Maybe we can just use statx() when available for all architectures? + */ +static __attribute__((unused)) +int sys_stat(const char *path, struct stat *buf) +{ + struct statx statx; + long ret; + + ret = sys_statx(AT_FDCWD, path, AT_NO_AUTOMOUNT, STATX_BASIC_STATS, &statx); + buf->st_dev = ((statx.stx_dev_minor & 0xff) + | (statx.stx_dev_major << 8) + | ((statx.stx_dev_minor & ~0xff) << 12)); + buf->st_ino = statx.stx_ino; + buf->st_mode = statx.stx_mode; + buf->st_nlink = statx.stx_nlink; + buf->st_uid = statx.stx_uid; + buf->st_gid = statx.stx_gid; + buf->st_rdev = ((statx.stx_rdev_minor & 0xff) + | (statx.stx_rdev_major << 8) + | ((statx.stx_rdev_minor & ~0xff) << 12)); + buf->st_size = statx.stx_size; + buf->st_blksize = statx.stx_blksize; + buf->st_blocks = statx.stx_blocks; + buf->st_atime = statx.stx_atime.tv_sec; + buf->st_mtime = statx.stx_mtime.tv_sec; + buf->st_ctime = statx.stx_ctime.tv_sec; + return ret; +} +#else static __attribute__((unused)) int sys_stat(const char *path, struct stat *buf) { @@ -1083,6 +1180,7 @@ int sys_stat(const char *path, struct stat *buf) buf->st_ctime = stat.st_ctime; return ret; } +#endif static __attribute__((unused)) int stat(const char *path, struct stat *buf) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index fbbc0e68c001..aedd7d9e3f64 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -9,6 +9,7 @@ #include "std.h" #include <linux/time.h> +#include <linux/stat.h> /* Only the generic macros and types may be defined here. The arch-specific @@ -16,7 +17,11 @@ * the layout of sys_stat_struct must not be defined here. */ -/* stat flags (WARNING, octal here) */ +/* stat flags (WARNING, octal here). We need to check for an existing + * definition because linux/stat.h may omit to define those if it finds + * that any glibc header was already included. + */ +#if !defined(S_IFMT) #define S_IFDIR 0040000 #define S_IFCHR 0020000 #define S_IFBLK 0060000 @@ -34,6 +39,22 @@ #define S_ISLNK(mode) (((mode) & S_IFMT) == S_IFLNK) #define S_ISSOCK(mode) (((mode) & S_IFMT) == S_IFSOCK) +#define S_IRWXU 00700 +#define S_IRUSR 00400 +#define S_IWUSR 00200 +#define S_IXUSR 00100 + +#define S_IRWXG 00070 +#define S_IRGRP 00040 +#define S_IWGRP 00020 +#define S_IXGRP 00010 + +#define S_IRWXO 00007 +#define S_IROTH 00004 +#define S_IWOTH 00002 +#define S_IXOTH 00001 +#endif + /* dirent types */ #define DT_UNKNOWN 0x0 #define DT_FIFO 0x1 @@ -60,11 +81,6 @@ #define MAXPATHLEN (PATH_MAX) #endif -/* Special FD used by all the *at functions */ -#ifndef AT_FDCWD -#define AT_FDCWD (-100) -#endif - /* whence values for lseek() */ #define SEEK_SET 0 #define SEEK_CUR 1 @@ -81,6 +97,8 @@ /* Macros used on waitpid()'s return status */ #define WEXITSTATUS(status) (((status) & 0xff00) >> 8) #define WIFEXITED(status) (((status) & 0x7f) == 0) +#define WTERMSIG(status) ((status) & 0x7f) +#define WIFSIGNALED(status) ((status) - 1 < 0xff) /* waitpid() flags */ #define WNOHANG 1 diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h index 1cfcd52106a4..ac7d53d986cd 100644 --- a/tools/include/nolibc/unistd.h +++ b/tools/include/nolibc/unistd.h @@ -13,6 +13,11 @@ #include "sys.h" +#define STDIN_FILENO 0 +#define STDOUT_FILENO 1 +#define STDERR_FILENO 2 + + static __attribute__((unused)) int msleep(unsigned int msecs) { diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h index b02c8e0f4057..1c7a0f6632c0 100644 --- a/tools/include/uapi/asm-generic/fcntl.h +++ b/tools/include/uapi/asm-generic/fcntl.h @@ -91,7 +91,6 @@ /* a horrid kludge trying to make sure that this will fail on old kernels */ #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) #ifndef O_NDELAY #define O_NDELAY O_NONBLOCK diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index 8e7085238470..6dc8b3642458 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -28,9 +28,10 @@ Explanation of the Linux-Kernel Memory Consistency Model 20. THE HAPPENS-BEFORE RELATION: hb 21. THE PROPAGATES-BEFORE RELATION: pb 22. RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-order, rcu-fence, and rb - 23. LOCKING - 24. PLAIN ACCESSES AND DATA RACES - 25. ODDS AND ENDS + 23. SRCU READ-SIDE CRITICAL SECTIONS + 24. LOCKING + 25. PLAIN ACCESSES AND DATA RACES + 26. ODDS AND ENDS @@ -1848,14 +1849,169 @@ section in P0 both starts before P1's grace period does and ends before it does, and the critical section in P2 both starts after P1's grace period does and ends after it does. -Addendum: The LKMM now supports SRCU (Sleepable Read-Copy-Update) in -addition to normal RCU. The ideas involved are much the same as -above, with new relations srcu-gp and srcu-rscsi added to represent -SRCU grace periods and read-side critical sections. There is a -restriction on the srcu-gp and srcu-rscsi links that can appear in an -rcu-order sequence (the srcu-rscsi links must be paired with srcu-gp -links having the same SRCU domain with proper nesting); the details -are relatively unimportant. +The LKMM supports SRCU (Sleepable Read-Copy-Update) in addition to +normal RCU. The ideas involved are much the same as above, with new +relations srcu-gp and srcu-rscsi added to represent SRCU grace periods +and read-side critical sections. However, there are some significant +differences between RCU read-side critical sections and their SRCU +counterparts, as described in the next section. + + +SRCU READ-SIDE CRITICAL SECTIONS +-------------------------------- + +The LKMM uses the srcu-rscsi relation to model SRCU read-side critical +sections. They differ from RCU read-side critical sections in the +following respects: + +1. Unlike the analogous RCU primitives, synchronize_srcu(), + srcu_read_lock(), and srcu_read_unlock() take a pointer to a + struct srcu_struct as an argument. This structure is called + an SRCU domain, and calls linked by srcu-rscsi must have the + same domain. Read-side critical sections and grace periods + associated with different domains are independent of one + another; the SRCU version of the RCU Guarantee applies only + to pairs of critical sections and grace periods having the + same domain. + +2. srcu_read_lock() returns a value, called the index, which must + be passed to the matching srcu_read_unlock() call. Unlike + rcu_read_lock() and rcu_read_unlock(), an srcu_read_lock() + call does not always have to match the next unpaired + srcu_read_unlock(). In fact, it is possible for two SRCU + read-side critical sections to overlap partially, as in the + following example (where s is an srcu_struct and idx1 and idx2 + are integer variables): + + idx1 = srcu_read_lock(&s); // Start of first RSCS + idx2 = srcu_read_lock(&s); // Start of second RSCS + srcu_read_unlock(&s, idx1); // End of first RSCS + srcu_read_unlock(&s, idx2); // End of second RSCS + + The matching is determined entirely by the domain pointer and + index value. By contrast, if the calls had been + rcu_read_lock() and rcu_read_unlock() then they would have + created two nested (fully overlapping) read-side critical + sections: an inner one and an outer one. + +3. The srcu_down_read() and srcu_up_read() primitives work + exactly like srcu_read_lock() and srcu_read_unlock(), except + that matching calls don't have to execute on the same CPU. + (The names are meant to be suggestive of operations on + semaphores.) Since the matching is determined by the domain + pointer and index value, these primitives make it possible for + an SRCU read-side critical section to start on one CPU and end + on another, so to speak. + +In order to account for these properties of SRCU, the LKMM models +srcu_read_lock() as a special type of load event (which is +appropriate, since it takes a memory location as argument and returns +a value, just as a load does) and srcu_read_unlock() as a special type +of store event (again appropriate, since it takes as arguments a +memory location and a value). These loads and stores are annotated as +belonging to the "srcu-lock" and "srcu-unlock" event classes +respectively. + +This approach allows the LKMM to tell whether two events are +associated with the same SRCU domain, simply by checking whether they +access the same memory location (i.e., they are linked by the loc +relation). It also gives a way to tell which unlock matches a +particular lock, by checking for the presence of a data dependency +from the load (srcu-lock) to the store (srcu-unlock). For example, +given the situation outlined earlier (with statement labels added): + + A: idx1 = srcu_read_lock(&s); + B: idx2 = srcu_read_lock(&s); + C: srcu_read_unlock(&s, idx1); + D: srcu_read_unlock(&s, idx2); + +the LKMM will treat A and B as loads from s yielding values saved in +idx1 and idx2 respectively. Similarly, it will treat C and D as +though they stored the values from idx1 and idx2 in s. The end result +is much as if we had written: + + A: idx1 = READ_ONCE(s); + B: idx2 = READ_ONCE(s); + C: WRITE_ONCE(s, idx1); + D: WRITE_ONCE(s, idx2); + +except for the presence of the special srcu-lock and srcu-unlock +annotations. You can see at once that we have A ->data C and +B ->data D. These dependencies tell the LKMM that C is the +srcu-unlock event matching srcu-lock event A, and D is the +srcu-unlock event matching srcu-lock event B. + +This approach is admittedly a hack, and it has the potential to lead +to problems. For example, in: + + idx1 = srcu_read_lock(&s); + srcu_read_unlock(&s, idx1); + idx2 = srcu_read_lock(&s); + srcu_read_unlock(&s, idx2); + +the LKMM will believe that idx2 must have the same value as idx1, +since it reads from the immediately preceding store of idx1 in s. +Fortunately this won't matter, assuming that litmus tests never do +anything with SRCU index values other than pass them to +srcu_read_unlock() or srcu_up_read() calls. + +However, sometimes it is necessary to store an index value in a +shared variable temporarily. In fact, this is the only way for +srcu_down_read() to pass the index it gets to an srcu_up_read() call +on a different CPU. In more detail, we might have soething like: + + struct srcu_struct s; + int x; + + P0() + { + int r0; + + A: r0 = srcu_down_read(&s); + B: WRITE_ONCE(x, r0); + } + + P1() + { + int r1; + + C: r1 = READ_ONCE(x); + D: srcu_up_read(&s, r1); + } + +Assuming that P1 executes after P0 and does read the index value +stored in x, we can write this (using brackets to represent event +annotations) as: + + A[srcu-lock] ->data B[once] ->rf C[once] ->data D[srcu-unlock]. + +The LKMM defines a carry-srcu-data relation to express this pattern; +it permits an arbitrarily long sequence of + + data ; rf + +pairs (that is, a data link followed by an rf link) to occur between +an srcu-lock event and the final data dependency leading to the +matching srcu-unlock event. carry-srcu-data is complicated by the +need to ensure that none of the intermediate store events in this +sequence are instances of srcu-unlock. This is necessary because in a +pattern like the one above: + + A: idx1 = srcu_read_lock(&s); + B: srcu_read_unlock(&s, idx1); + C: idx2 = srcu_read_lock(&s); + D: srcu_read_unlock(&s, idx2); + +the LKMM treats B as a store to the variable s and C as a load from +that variable, creating an undesirable rf link from B to C: + + A ->data B ->rf C ->data D. + +This would cause carry-srcu-data to mistakenly extend a data +dependency from A to D, giving the impression that D was the +srcu-unlock event matching A's srcu-lock. To avoid such problems, +carry-srcu-data does not accept sequences in which the ends of any of +the intermediate ->data links (B above) is an srcu-unlock event. LOCKING diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt index 26554b1c5575..acac527328a1 100644 --- a/tools/memory-model/Documentation/litmus-tests.txt +++ b/tools/memory-model/Documentation/litmus-tests.txt @@ -1028,32 +1028,7 @@ Limitations of the Linux-kernel memory model (LKMM) include: additional call_rcu() process to the site of the emulated rcu-barrier(). - e. Although sleepable RCU (SRCU) is now modeled, there - are some subtle differences between its semantics and - those in the Linux kernel. For example, the kernel - might interpret the following sequence as two partially - overlapping SRCU read-side critical sections: - - 1 r1 = srcu_read_lock(&my_srcu); - 2 do_something_1(); - 3 r2 = srcu_read_lock(&my_srcu); - 4 do_something_2(); - 5 srcu_read_unlock(&my_srcu, r1); - 6 do_something_3(); - 7 srcu_read_unlock(&my_srcu, r2); - - In contrast, LKMM will interpret this as a nested pair of - SRCU read-side critical sections, with the outer critical - section spanning lines 1-7 and the inner critical section - spanning lines 3-5. - - This difference would be more of a concern had anyone - identified a reasonable use case for partially overlapping - SRCU read-side critical sections. For more information - on the trickiness of such overlapping, please see: - https://paulmck.livejournal.com/40593.html - - f. Reader-writer locking is not modeled. It can be + e. Reader-writer locking is not modeled. It can be emulated in litmus tests using atomic read-modify-write operations. diff --git a/tools/memory-model/Documentation/locking.txt b/tools/memory-model/Documentation/locking.txt new file mode 100644 index 000000000000..65c898c64a93 --- /dev/null +++ b/tools/memory-model/Documentation/locking.txt @@ -0,0 +1,298 @@ +Locking +======= + +Locking is well-known and the common use cases are straightforward: Any +CPU holding a given lock sees any changes previously seen or made by any +CPU before it previously released that same lock. This last sentence +is the only part of this document that most developers will need to read. + +However, developers who would like to also access lock-protected shared +variables outside of their corresponding locks should continue reading. + + +Locking and Prior Accesses +-------------------------- + +The basic rule of locking is worth repeating: + + Any CPU holding a given lock sees any changes previously seen + or made by any CPU before it previously released that same lock. + +Note that this statement is a bit stronger than "Any CPU holding a +given lock sees all changes made by any CPU during the time that CPU was +previously holding this same lock". For example, consider the following +pair of code fragments: + + /* See MP+polocks.litmus. */ + void CPU0(void) + { + WRITE_ONCE(x, 1); + spin_lock(&mylock); + WRITE_ONCE(y, 1); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + r0 = READ_ONCE(y); + spin_unlock(&mylock); + r1 = READ_ONCE(x); + } + +The basic rule guarantees that if CPU0() acquires mylock before CPU1(), +then both r0 and r1 must be set to the value 1. This also has the +consequence that if the final value of r0 is equal to 1, then the final +value of r1 must also be equal to 1. In contrast, the weaker rule would +say nothing about the final value of r1. + + +Locking and Subsequent Accesses +------------------------------- + +The converse to the basic rule also holds: Any CPU holding a given +lock will not see any changes that will be made by any CPU after it +subsequently acquires this same lock. This converse statement is +illustrated by the following litmus test: + + /* See MP+porevlocks.litmus. */ + void CPU0(void) + { + r0 = READ_ONCE(y); + spin_lock(&mylock); + r1 = READ_ONCE(x); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + WRITE_ONCE(x, 1); + spin_unlock(&mylock); + WRITE_ONCE(y, 1); + } + +This converse to the basic rule guarantees that if CPU0() acquires +mylock before CPU1(), then both r0 and r1 must be set to the value 0. +This also has the consequence that if the final value of r1 is equal +to 0, then the final value of r0 must also be equal to 0. In contrast, +the weaker rule would say nothing about the final value of r0. + +These examples show only a single pair of CPUs, but the effects of the +locking basic rule extend across multiple acquisitions of a given lock +across multiple CPUs. + + +Double-Checked Locking +---------------------- + +It is well known that more than just a lock is required to make +double-checked locking work correctly, This litmus test illustrates +one incorrect approach: + + /* See Documentation/litmus-tests/locking/DCL-broken.litmus. */ + void CPU0(void) + { + r0 = READ_ONCE(flag); + if (r0 == 0) { + spin_lock(&lck); + r1 = READ_ONCE(flag); + if (r1 == 0) { + WRITE_ONCE(data, 1); + WRITE_ONCE(flag, 1); + } + spin_unlock(&lck); + } + r2 = READ_ONCE(data); + } + /* CPU1() is the exactly the same as CPU0(). */ + +There are two problems. First, there is no ordering between the first +READ_ONCE() of "flag" and the READ_ONCE() of "data". Second, there is +no ordering between the two WRITE_ONCE() calls. It should therefore be +no surprise that "r2" can be zero, and a quick herd7 run confirms this. + +One way to fix this is to use smp_load_acquire() and smp_store_release() +as shown in this corrected version: + + /* See Documentation/litmus-tests/locking/DCL-fixed.litmus. */ + void CPU0(void) + { + r0 = smp_load_acquire(&flag); + if (r0 == 0) { + spin_lock(&lck); + r1 = READ_ONCE(flag); + if (r1 == 0) { + WRITE_ONCE(data, 1); + smp_store_release(&flag, 1); + } + spin_unlock(&lck); + } + r2 = READ_ONCE(data); + } + /* CPU1() is the exactly the same as CPU0(). */ + +The smp_load_acquire() guarantees that its load from "flags" will +be ordered before the READ_ONCE() from data, thus solving the first +problem. The smp_store_release() guarantees that its store will be +ordered after the WRITE_ONCE() to "data", solving the second problem. +The smp_store_release() pairs with the smp_load_acquire(), thus ensuring +that the ordering provided by each actually takes effect. Again, a +quick herd7 run confirms this. + +In short, if you access a lock-protected variable without holding the +corresponding lock, you will need to provide additional ordering, in +this case, via the smp_load_acquire() and the smp_store_release(). + + +Ordering Provided by a Lock to CPUs Not Holding That Lock +--------------------------------------------------------- + +It is not necessarily the case that accesses ordered by locking will be +seen as ordered by CPUs not holding that lock. Consider this example: + + /* See Z6.0+pooncelock+pooncelock+pombonce.litmus. */ + void CPU0(void) + { + spin_lock(&mylock); + WRITE_ONCE(x, 1); + WRITE_ONCE(y, 1); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + r0 = READ_ONCE(y); + WRITE_ONCE(z, 1); + spin_unlock(&mylock); + } + + void CPU2(void) + { + WRITE_ONCE(z, 2); + smp_mb(); + r1 = READ_ONCE(x); + } + +Counter-intuitive though it might be, it is quite possible to have +the final value of r0 be 1, the final value of z be 2, and the final +value of r1 be 0. The reason for this surprising outcome is that CPU2() +never acquired the lock, and thus did not fully benefit from the lock's +ordering properties. + +Ordering can be extended to CPUs not holding the lock by careful use +of smp_mb__after_spinlock(): + + /* See Z6.0+pooncelock+poonceLock+pombonce.litmus. */ + void CPU0(void) + { + spin_lock(&mylock); + WRITE_ONCE(x, 1); + WRITE_ONCE(y, 1); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + smp_mb__after_spinlock(); + r0 = READ_ONCE(y); + WRITE_ONCE(z, 1); + spin_unlock(&mylock); + } + + void CPU2(void) + { + WRITE_ONCE(z, 2); + smp_mb(); + r1 = READ_ONCE(x); + } + +This addition of smp_mb__after_spinlock() strengthens the lock +acquisition sufficiently to rule out the counter-intuitive outcome. +In other words, the addition of the smp_mb__after_spinlock() prohibits +the counter-intuitive result where the final value of r0 is 1, the final +value of z is 2, and the final value of r1 is 0. + + +No Roach-Motel Locking! +----------------------- + +This example requires familiarity with the herd7 "filter" clause, so +please read up on that topic in litmus-tests.txt. + +It is tempting to allow memory-reference instructions to be pulled +into a critical section, but this cannot be allowed in the general case. +For example, consider a spin loop preceding a lock-based critical section. +Now, herd7 does not model spin loops, but we can emulate one with two +loads, with a "filter" clause to constrain the first to return the +initial value and the second to return the updated value, as shown below: + + /* See Documentation/litmus-tests/locking/RM-fixed.litmus. */ + void CPU0(void) + { + spin_lock(&lck); + r2 = atomic_inc_return(&y); + WRITE_ONCE(x, 1); + spin_unlock(&lck); + } + + void CPU1(void) + { + r0 = READ_ONCE(x); + r1 = READ_ONCE(x); + spin_lock(&lck); + r2 = atomic_inc_return(&y); + spin_unlock(&lck); + } + + filter (1:r0=0 /\ 1:r1=1) + exists (1:r2=1) + +The variable "x" is the control variable for the emulated spin loop. +CPU0() sets it to "1" while holding the lock, and CPU1() emulates the +spin loop by reading it twice, first into "1:r0" (which should get the +initial value "0") and then into "1:r1" (which should get the updated +value "1"). + +The "filter" clause takes this into account, constraining "1:r0" to +equal "0" and "1:r1" to equal 1. + +Then the "exists" clause checks to see if CPU1() acquired its lock first, +which should not happen given the filter clause because CPU0() updates +"x" while holding the lock. And herd7 confirms this. + +But suppose that the compiler was permitted to reorder the spin loop +into CPU1()'s critical section, like this: + + /* See Documentation/litmus-tests/locking/RM-broken.litmus. */ + void CPU0(void) + { + int r2; + + spin_lock(&lck); + r2 = atomic_inc_return(&y); + WRITE_ONCE(x, 1); + spin_unlock(&lck); + } + + void CPU1(void) + { + spin_lock(&lck); + r0 = READ_ONCE(x); + r1 = READ_ONCE(x); + r2 = atomic_inc_return(&y); + spin_unlock(&lck); + } + + filter (1:r0=0 /\ 1:r1=1) + exists (1:r2=1) + +If "1:r0" is equal to "0", "1:r1" can never equal "1" because CPU0() +cannot update "x" while CPU1() holds the lock. And herd7 confirms this, +showing zero executions matching the "filter" criteria. + +And this is why Linux-kernel lock and unlock primitives must prevent +code from entering critical sections. It is not sufficient to only +prevent code from leaving them. diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell index 70a9073dec3e..ce068700939c 100644 --- a/tools/memory-model/linux-kernel.bell +++ b/tools/memory-model/linux-kernel.bell @@ -31,7 +31,8 @@ enum Barriers = 'wmb (*smp_wmb*) || 'before-atomic (*smp_mb__before_atomic*) || 'after-atomic (*smp_mb__after_atomic*) || 'after-spinlock (*smp_mb__after_spinlock*) || - 'after-unlock-lock (*smp_mb__after_unlock_lock*) + 'after-unlock-lock (*smp_mb__after_unlock_lock*) || + 'after-srcu-read-unlock (*smp_mb__after_srcu_read_unlock*) instructions F[Barriers] (* SRCU *) @@ -53,38 +54,31 @@ let rcu-rscs = let rec in matched (* Validate nesting *) -flag ~empty Rcu-lock \ domain(rcu-rscs) as unbalanced-rcu-locking -flag ~empty Rcu-unlock \ range(rcu-rscs) as unbalanced-rcu-locking +flag ~empty Rcu-lock \ domain(rcu-rscs) as unmatched-rcu-lock +flag ~empty Rcu-unlock \ range(rcu-rscs) as unmatched-rcu-unlock (* Compute matching pairs of nested Srcu-lock and Srcu-unlock *) -let srcu-rscs = let rec - unmatched-locks = Srcu-lock \ domain(matched) - and unmatched-unlocks = Srcu-unlock \ range(matched) - and unmatched = unmatched-locks | unmatched-unlocks - and unmatched-po = ([unmatched] ; po ; [unmatched]) & loc - and unmatched-locks-to-unlocks = - ([unmatched-locks] ; po ; [unmatched-unlocks]) & loc - and matched = matched | (unmatched-locks-to-unlocks \ - (unmatched-po ; unmatched-po)) - in matched +let carry-srcu-data = (data ; [~ Srcu-unlock] ; rf)* +let srcu-rscs = ([Srcu-lock] ; carry-srcu-data ; data ; [Srcu-unlock]) & loc (* Validate nesting *) -flag ~empty Srcu-lock \ domain(srcu-rscs) as unbalanced-srcu-locking -flag ~empty Srcu-unlock \ range(srcu-rscs) as unbalanced-srcu-locking +flag ~empty Srcu-lock \ domain(srcu-rscs) as unmatched-srcu-lock +flag ~empty Srcu-unlock \ range(srcu-rscs) as unmatched-srcu-unlock +flag ~empty (srcu-rscs^-1 ; srcu-rscs) \ id as multiple-srcu-matches (* Check for use of synchronize_srcu() inside an RCU critical section *) flag ~empty rcu-rscs & (po ; [Sync-srcu] ; po) as invalid-sleep (* Validate SRCU dynamic match *) -flag ~empty different-values(srcu-rscs) as srcu-bad-nesting +flag ~empty different-values(srcu-rscs) as srcu-bad-value-match (* Compute marked and plain memory accesses *) let Marked = (~M) | IW | Once | Release | Acquire | domain(rmw) | range(rmw) | - LKR | LKW | UL | LF | RL | RU + LKR | LKW | UL | LF | RL | RU | Srcu-lock | Srcu-unlock let Plain = M \ Marked (* Redefine dependencies to include those carried through plain accesses *) -let carry-dep = (data ; rfi)* +let carry-dep = (data ; [~ Srcu-unlock] ; rfi)* let addr = carry-dep ; addr let ctrl = carry-dep ; ctrl let data = carry-dep ; data diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index 07f884f9b2bf..adf3c4f41229 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -37,8 +37,20 @@ let mb = ([M] ; fencerel(Mb) ; [M]) | ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; - fencerel(After-unlock-lock) ; [M]) +(* + * Note: The po-unlock-lock-po relation only passes the lock to the direct + * successor, perhaps giving the impression that the ordering of the + * smp_mb__after_unlock_lock() fence only affects a single lock handover. + * However, in a longer sequence of lock handovers, the implicit + * A-cumulative release fences of lock-release ensure that any stores that + * propagate to one of the involved CPUs before it hands over the lock to + * the next CPU will also propagate to the final CPU handing over the lock + * to the CPU that executes the fence. Therefore, all those stores are + * also affected by the fence. + *) + ([M] ; po-unlock-lock-po ; + [After-unlock-lock] ; po ; [M]) | + ([M] ; po? ; [Srcu-unlock] ; fencerel(After-srcu-read-unlock) ; [M]) let gp = po ; [Sync-rcu | Sync-srcu] ; po? let strong-fence = mb | gp @@ -69,8 +81,8 @@ let dep = addr | data let rwdep = (dep | ctrl) ; [W] let overwrite = co | fr let to-w = rwdep | (overwrite & int) | (addr ; [Plain] ; wmb) -let to-r = addr | (dep ; [Marked] ; rfi) -let ppo = to-r | to-w | fence | (po-unlock-lock-po & int) +let to-r = (addr ; [R]) | (dep ; [Marked] ; rfi) +let ppo = to-r | to-w | (fence & int) | (po-unlock-lock-po & int) (* Propagation: Ordering from release operations and strong fences. *) let A-cumul(r) = (rfe ; [Marked])? ; r diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def index ef0f3c1850de..88a39601f525 100644 --- a/tools/memory-model/linux-kernel.def +++ b/tools/memory-model/linux-kernel.def @@ -24,6 +24,7 @@ smp_mb__before_atomic() { __fence{before-atomic}; } smp_mb__after_atomic() { __fence{after-atomic}; } smp_mb__after_spinlock() { __fence{after-spinlock}; } smp_mb__after_unlock_lock() { __fence{after-unlock-lock}; } +smp_mb__after_srcu_read_unlock() { __fence{after-srcu-read-unlock}; } barrier() { __fence{barrier}; } // Exchange @@ -49,8 +50,10 @@ synchronize_rcu() { __fence{sync-rcu}; } synchronize_rcu_expedited() { __fence{sync-rcu}; } // SRCU -srcu_read_lock(X) __srcu{srcu-lock}(X) -srcu_read_unlock(X,Y) { __srcu{srcu-unlock}(X,Y); } +srcu_read_lock(X) __load{srcu-lock}(*X) +srcu_read_unlock(X,Y) { __store{srcu-unlock}(*X,Y); } +srcu_down_read(X) __load{srcu-lock}(*X) +srcu_up_read(X,Y) { __store{srcu-unlock}(*X,Y); } synchronize_srcu(X) { __srcu{sync-srcu}(X); } synchronize_srcu_expedited(X) { __srcu{sync-srcu}(X); } diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore index c492a1ddad91..19c379cf069d 100644 --- a/tools/memory-model/litmus-tests/.gitignore +++ b/tools/memory-model/litmus-tests/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -*.litmus.out +*.litmus.* diff --git a/tools/memory-model/lock.cat b/tools/memory-model/lock.cat index 6b52f365d73a..53b5a492739d 100644 --- a/tools/memory-model/lock.cat +++ b/tools/memory-model/lock.cat @@ -36,9 +36,9 @@ let RU = try RU with emptyset (* Treat RL as a kind of LF: a read with no ordering properties *) let LF = LF | RL -(* There should be no ordinary R or W accesses to spinlocks *) -let ALL-LOCKS = LKR | LKW | UL | LF | RU -flag ~empty [M \ IW] ; loc ; [ALL-LOCKS] as mixed-lock-accesses +(* There should be no ordinary R or W accesses to spinlocks or SRCU structs *) +let ALL-LOCKS = LKR | LKW | UL | LF | RU | Srcu-lock | Srcu-unlock | Sync-srcu +flag ~empty [M \ IW \ ALL-LOCKS] ; loc ; [ALL-LOCKS] as mixed-lock-accesses (* Link Lock-Reads to their RMW-partner Lock-Writes *) let lk-rmw = ([LKR] ; po-loc ; [LKW]) \ (po ; po) diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README index 095c7eb36f9f..fb39bd0fd1b9 100644 --- a/tools/memory-model/scripts/README +++ b/tools/memory-model/scripts/README @@ -27,6 +27,14 @@ checklitmushist.sh checklitmus.sh Check a single litmus test against its "Result:" expected result. + Not intended to for manual use. + +checktheselitmus.sh + + Check the specified list of litmus tests against their "Result:" + expected results. This takes optional parseargs.sh arguments, + followed by "--" followed by pathnames starting from the current + directory. cmplitmushist.sh @@ -43,10 +51,10 @@ initlitmushist.sh judgelitmus.sh - Given a .litmus file and its .litmus.out herd7 output, check the - .litmus.out file against the .litmus file's "Result:" comment to - judge whether the test ran correctly. Not normally run manually, - provided instead for use by other scripts. + Given a .litmus file and its herd7 output, check the output file + against the .litmus file's "Result:" comment to judge whether + the test ran correctly. Not normally run manually, provided + instead for use by other scripts. newlitmushist.sh @@ -68,3 +76,35 @@ runlitmushist.sh README This file + +Testing a change to LKMM might go as follows: + + # Populate expected results without that change, and + # runs for about an hour on an 8-CPU x86 system: + scripts/initlitmushist.sh --timeout 10m --procs 10 + # Incorporate the change: + git am -s -3 /path/to/patch # Or whatever it takes. + + # Test the new version of LKMM as follows... + + # Runs in seconds, good smoke test: + scripts/checkalllitmus.sh + + # Compares results to those produced by initlitmushist.sh, + # and runs for about an hour on an 8-CPU x86 system: + scripts/checklitmushist.sh --timeout 10m --procs 10 + + # Checks results against Result tags, runs in minutes: + scripts/checkghlitmus.sh --timeout 10m --procs 10 + +The checkghlitmus.sh should not report errors in cases where the +checklitmushist.sh script did not also report a change. However, +this check is nevertheless valuable because it can find errors in the +original version of LKMM. Note however, that given the above procedure, +an error in the original LKMM version that is fixed by the patch will +be reported both as a mismatch by checklitmushist.sh and as an error +by checkghlitmus.sh. One exception to this rule of thumb is when the +test fails completely on the original version of LKMM and passes on the +new version. In this case, checklitmushist.sh will report a mismatch +and checkghlitmus.sh will report success. This happens when the change +to LKMM introduces a new primitive for which litmus tests already existed. diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index 3c0c7fbbd223..2d3ee850a839 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # # Run herd7 tests on all .litmus files in the litmus-tests directory @@ -8,6 +8,11 @@ # "^^^". It also outputs verification results to a file whose name is # that of the specified litmus test, but with ".out" appended. # +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# # Usage: # checkalllitmus.sh # @@ -17,7 +22,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +# Author: Paul E. McKenney <paulmck@linux.ibm.com> . scripts/parseargs.sh @@ -30,29 +35,23 @@ else exit 255 fi -# Create any new directories that have appeared in the github litmus -# repo since the last run. +# Create any new directories that have appeared in the litmus-tests +# directory since the last run. if test "$LKMM_DESTDIR" != "." then find $litmusdir -type d -print | ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) fi -# Find the checklitmus script. If it is not where we expect it, then -# assume that the caller has the PATH environment variable set -# appropriately. -if test -x scripts/checklitmus.sh -then - clscript=scripts/checklitmus.sh -else - clscript=checklitmus.sh -fi - # Run the script on all the litmus tests in the specified directory ret=0 for i in $litmusdir/*.litmus do - if ! $clscript $i + if test -n "$LKMM_HW_MAP_FILE" && ! scripts/simpletest.sh $i + then + continue + fi + if ! scripts/checklitmus.sh $i then ret=1 fi diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh index 6589fbb6f653..d3dfb321259f 100755 --- a/tools/memory-model/scripts/checkghlitmus.sh +++ b/tools/memory-model/scripts/checkghlitmus.sh @@ -10,6 +10,7 @@ # parseargs.sh scripts for arguments. . scripts/parseargs.sh +. scripts/hwfnseg.sh T=/tmp/checkghlitmus.sh.$$ trap 'rm -rf $T' 0 @@ -32,19 +33,19 @@ then ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) fi -# Create a list of the C-language litmus tests previously run. -( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | - sed -e 's/\.out$//' | - xargs -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' | +# Create a list of the specified litmus tests previously run. +( cd $LKMM_DESTDIR; find litmus -name "*.litmus${hwfnseg}.out" -print ) | + sed -e "s/${hwfnseg}"'\.out$//' | + xargs -r grep -E -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' | xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already # Create a list of C-language litmus tests with "Result:" commands and # no more than the specified number of processes. -find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C -xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C +xargs < $T/list-C -r grep -E -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short -# Form list of tests without corresponding .litmus.out files +# Form list of tests without corresponding .out files sort $T/list-C-already $T/list-C-result-short | uniq -u > $T/list-C-needed # Run any needed tests. diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index 11461ed40b5e..4c1d0cf0ddad 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -1,10 +1,8 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Run a herd7 test and invokes judgelitmus.sh to check the result against -# a "Result:" comment within the litmus test. It also outputs verification -# results to a file whose name is that of the specified litmus test, but -# with ".out" appended. +# Invokes runlitmus.sh and judgelitmus.sh on its arguments to run the +# specified litmus test and pass judgment on the results. # # Usage: # checklitmus.sh file.litmus @@ -15,20 +13,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +# Author: Paul E. McKenney <paulmck@linux.ibm.com> -litmus=$1 -herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} - -if test -f "$litmus" -a -r "$litmus" -then - : -else - echo ' --- ' error: \"$litmus\" is not a readable file - exit 255 -fi - -echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 - -scripts/judgelitmus.sh $litmus +scripts/runlitmus.sh $1 +scripts/judgelitmus.sh $1 diff --git a/tools/memory-model/scripts/checklitmushist.sh b/tools/memory-model/scripts/checklitmushist.sh index 1d210ffb7c8a..406ecfc0aee4 100755 --- a/tools/memory-model/scripts/checklitmushist.sh +++ b/tools/memory-model/scripts/checklitmushist.sh @@ -12,7 +12,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +# Author: Paul E. McKenney <paulmck@linux.ibm.com> . scripts/parseargs.sh diff --git a/tools/memory-model/scripts/checktheselitmus.sh b/tools/memory-model/scripts/checktheselitmus.sh new file mode 100755 index 000000000000..10eeb5ecea6d --- /dev/null +++ b/tools/memory-model/scripts/checktheselitmus.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Invokes checklitmus.sh on its arguments to run the specified litmus +# test and pass judgment on the results. +# +# Usage: +# checktheselitmus.sh -- [ file1.litmus [ file2.litmus ... ] ] +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The usual parseargs.sh arguments +# can be specified prior to the "--". +# +# This script is intended for use with pathnames that start from the +# tools/memory-model directory. If some of the pathnames instead start at +# the root directory, they all must do so and the "--destdir /" parseargs.sh +# argument must be specified prior to the "--". Alternatively, some other +# "--destdir" argument can be supplied as long as the needed subdirectories +# are populated. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.ibm.com> + +. scripts/parseargs.sh + +ret=0 +for i in "$@" +do + if scripts/checklitmus.sh $i + then + : + else + ret=1 + fi +done +if test "$ret" -ne 0 +then + echo " ^^^ VERIFICATION MISMATCHES" 1>&2 +else + echo All litmus tests verified as was expected. 1>&2 +fi +exit $ret diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh index 0f498aeeccf5..ca1ac8b64614 100755 --- a/tools/memory-model/scripts/cmplitmushist.sh +++ b/tools/memory-model/scripts/cmplitmushist.sh @@ -12,12 +12,49 @@ trap 'rm -rf $T' 0 mkdir $T # comparetest oldpath newpath +badmacnam=0 +timedout=0 perfect=0 obsline=0 noobsline=0 obsresult=0 badcompare=0 comparetest () { + if grep -q ': Unknown macro ' $1 || grep -q ': Unknown macro ' $2 + then + if grep -q ': Unknown macro ' $1 + then + badname=`grep ': Unknown macro ' $1 | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + echo 'Current LKMM version does not know "'$badname'"' $1 + fi + if grep -q ': Unknown macro ' $2 + then + badname=`grep ': Unknown macro ' $2 | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + echo 'Current LKMM version does not know "'$badname'"' $2 + fi + badmacnam=`expr "$badmacnam" + 1` + return 0 + elif grep -q '^Command exited with non-zero status 124' $1 || + grep -q '^Command exited with non-zero status 124' $2 + then + if grep -q '^Command exited with non-zero status 124' $1 && + grep -q '^Command exited with non-zero status 124' $2 + then + echo Both runs timed out: $2 + elif grep -q '^Command exited with non-zero status 124' $1 + then + echo Old run timed out: $2 + elif grep -q '^Command exited with non-zero status 124' $2 + then + echo New run timed out: $2 + fi + timedout=`expr "$timedout" + 1` + return 0 + fi grep -v 'maxresident)k\|minor)pagefaults\|^Time' $1 > $T/oldout grep -v 'maxresident)k\|minor)pagefaults\|^Time' $2 > $T/newout if cmp -s $T/oldout $T/newout && grep -q '^Observation' $1 @@ -38,7 +75,7 @@ comparetest () { return 0 fi else - echo Missing Observation line "(e.g., herd7 timeout)": $2 + echo Missing Observation line "(e.g., syntax error)": $2 noobsline=`expr "$noobsline" + 1` return 0 fi @@ -72,12 +109,20 @@ then fi if test "$noobsline" -ne 0 then - echo Missing Observation line "(e.g., herd7 timeout)": $noobsline 1>&2 + echo Missing Observation line "(e.g., syntax error)": $noobsline 1>&2 fi if test "$obsresult" -ne 0 then echo Matching Observation Always/Sometimes/Never result: $obsresult 1>&2 fi +if test "$timedout" -ne 0 +then + echo "!!!" Timed out: $timedout 1>&2 +fi +if test "$badmacnam" -ne 0 +then + echo "!!!" Unknown primitive: $badmacnam 1>&2 +fi if test "$badcompare" -ne 0 then echo "!!!" Result changed: $badcompare 1>&2 diff --git a/tools/memory-model/scripts/hwfnseg.sh b/tools/memory-model/scripts/hwfnseg.sh new file mode 100755 index 000000000000..580c3281181c --- /dev/null +++ b/tools/memory-model/scripts/hwfnseg.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Generate the hardware extension to the litmus-test filename, or the +# empty string if this is an LKMM run. The extension is placed in +# the shell variable hwfnseg. +# +# Usage: +# . hwfnseg.sh +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney <paulmck@linux.ibm.com> + +if test -z "$LKMM_HW_MAP_FILE" +then + hwfnseg= +else + hwfnseg=".$LKMM_HW_MAP_FILE" +fi diff --git a/tools/memory-model/scripts/initlitmushist.sh b/tools/memory-model/scripts/initlitmushist.sh index 956b6957484d..31ea782955d3 100755 --- a/tools/memory-model/scripts/initlitmushist.sh +++ b/tools/memory-model/scripts/initlitmushist.sh @@ -60,7 +60,7 @@ fi # Create a list of the C-language litmus tests with no more than the # specified number of processes (per the --procs argument). -find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C xargs < $T/list-C -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short scripts/runlitmushist.sh < $T/list-C-short diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 0cc63875e395..1ec5d89fcfbb 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -1,9 +1,22 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Given a .litmus test and the corresponding .litmus.out file, check -# the .litmus.out file against the "Result:" comment to judge whether -# the test ran correctly. +# Given a .litmus test and the corresponding litmus output file, check +# the .litmus.out file against the "Result:" comment to judge whether the +# test ran correctly. If the --hw argument is omitted, check against the +# LKMM output, which is assumed to be in file.litmus.out. If either a +# "DATARACE" marker in the "Result:" comment or a "Flag data-race" marker +# in the LKMM output is present, the other must also be as well, at least +# for litmus tests having a "Result:" comment. In this case, a failure of +# the Always/Sometimes/Never portion of the "Result:" prediction will be +# noted, but forgiven. +# +# If the --hw argument is provided, this is assumed to be a hardware +# test, and the output is assumed to be in file.litmus.HW.out, where +# "HW" is the --hw argument. In addition, non-Sometimes verification +# results will be noted, but forgiven. Furthermore, if there is no +# "Result:" comment but there is an LKMM .litmus.out file, the observation +# in that file will be used to judge the assembly-language verification. # # Usage: # judgelitmus.sh file.litmus @@ -13,7 +26,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +# Author: Paul E. McKenney <paulmck@linux.ibm.com> litmus=$1 @@ -24,55 +37,120 @@ else echo ' --- ' error: \"$litmus\" is not a readable file exit 255 fi -if test -f "$LKMM_DESTDIR/$litmus".out -a -r "$LKMM_DESTDIR/$litmus".out +if test -z "$LKMM_HW_MAP_FILE" +then + litmusout=$litmus.out + lkmmout= +else + litmusout="`echo $litmus | + sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'`.out" + lkmmout=$litmus.out +fi +if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout" then : else - echo ' --- ' error: \"$LKMM_DESTDIR/$litmus\".out is not a readable file + echo ' --- ' error: \"$LKMM_DESTDIR/$litmusout is not a readable file exit 255 fi -if grep -q '^ \* Result: ' $litmus +if grep -q '^Flag data-race$' "$LKMM_DESTDIR/$litmusout" +then + datarace_modeled=1 +fi +if grep -q '^[( ]\* Result: ' $litmus +then + outcome=`grep -m 1 '^[( ]\* Result: ' $litmus | awk '{ print $3 }'` + if grep -m1 '^[( ]\* Result: .* DATARACE' $litmus + then + datarace_predicted=1 + fi + if test -n "$datarace_predicted" -a -z "$datarace_modeled" -a -z "$LKMM_HW_MAP_FILE" + then + echo '!!! Predicted data race not modeled' $litmus + exit 252 + elif test -z "$datarace_predicted" -a -n "$datarace_modeled" + then + # Note that hardware models currently don't model data races + echo '!!! Unexpected data race modeled' $litmus + exit 253 + fi +elif test -n "$LKMM_HW_MAP_FILE" && grep -q '^Observation' $LKMM_DESTDIR/$lkmmout > /dev/null 2>&1 then - outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` + outcome=`grep -m 1 '^Observation ' $LKMM_DESTDIR/$lkmmout | awk '{ print $3 }'` else outcome=specified fi -grep '^Observation' $LKMM_DESTDIR/$litmus.out -if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out +grep '^Observation' $LKMM_DESTDIR/$litmusout +if grep -q '^Observation' $LKMM_DESTDIR/$litmusout then : +elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout +then + badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + badmsg=' !!! Current LKMM version does not know "'$badname'"'" $litmus" + echo $badmsg + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout + then + echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmusout 2>&1 + fi + exit 254 +elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmusout +then + echo ' !!! Timeout' $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout + then + echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmusout 2>&1 + fi + exit 124 else echo ' !!! Verification error' $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmusout 2>&1 fi exit 255 fi if test "$outcome" = DEADLOCK then - if grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' + if grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$' then ret=0 else echo " !!! Unexpected non-$outcome verification" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1 fi ret=1 fi -elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe +elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$' +then + echo " !!! Unexpected non-$outcome deadlock" $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout + then + echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmusout 2>&1 + fi + ret=1 +elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q $outcome || test "$outcome" = Maybe then ret=0 else - echo " !!! Unexpected non-$outcome verification" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if test \( -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes \) -o -n "$datarace_modeled" then - echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + flag="--- Forgiven" + ret=0 + else + flag="!!! Unexpected" + ret=1 + fi + echo " $flag non-$outcome verification" $litmus + if ! grep -qe "$flag" $LKMM_DESTDIR/$litmusout + then + echo " $flag non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1 fi - ret=1 fi -tail -2 $LKMM_DESTDIR/$litmus.out | head -1 +tail -2 $LKMM_DESTDIR/$litmusout | head -1 exit $ret diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh index 991f8f814881..25235e2049cf 100755 --- a/tools/memory-model/scripts/newlitmushist.sh +++ b/tools/memory-model/scripts/newlitmushist.sh @@ -12,7 +12,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +# Author: Paul E. McKenney <paulmck@linux.ibm.com> . scripts/parseargs.sh @@ -43,7 +43,7 @@ fi # Form full list of litmus tests with no more than the specified # number of processes (per the --procs argument). -find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C-all +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C-all xargs < $T/list-C-all -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short # Form list of new tests. Note: This does not handle litmus-test deletion! diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 40f52080fdbd..08ded5909860 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# the corresponding .litmus.out file, and does not judge the result. +# Parse arguments common to the various scripts. # # . scripts/parseargs.sh # @@ -9,7 +9,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +# Author: Paul E. McKenney <paulmck@linux.ibm.com> T=/tmp/parseargs.sh.$$ mkdir $T @@ -27,6 +27,7 @@ initparam () { initparam LKMM_DESTDIR "." initparam LKMM_HERD_OPTIONS "-conf linux-kernel.cfg" +initparam LKMM_HW_MAP_FILE "" initparam LKMM_JOBS `getconf _NPROCESSORS_ONLN` initparam LKMM_PROCS "3" initparam LKMM_TIMEOUT "1m" @@ -37,10 +38,11 @@ usagehelp () { echo "Usage $scriptname [ arguments ]" echo " --destdir path (place for .litmus.out, default by .litmus)" echo " --herdopts -conf linux-kernel.cfg ..." + echo " --hw AArch64" echo " --jobs N (number of jobs, default one per CPU)" echo " --procs N (litmus tests with at most this many processes)" echo " --timeout N (herd7 timeout (e.g., 10s, 1m, 2hr, 1d, '')" - echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" + echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --hw '$LKMM_HW_MAP_FILE' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" exit 1 } @@ -81,7 +83,7 @@ do echo "Cannot create directory --destdir '$LKMM_DESTDIR'" usage fi - if test -d "$LKMM_DESTDIR" -a -w "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" + if test -d "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" then : else @@ -95,6 +97,11 @@ do LKMM_HERD_OPTIONS="$2" shift ;; + --hw) + checkarg --hw "(.map file architecture name)" "$#" "$2" '^[A-Za-z0-9_-]\+' '^--' + LKMM_HW_MAP_FILE="$2" + shift + ;; -j[1-9]*) njobs="`echo $1 | sed -e 's/^-j//'`" trailchars="`echo $njobs | sed -e 's/[0-9]\+\(.*\)$/\1/'`" @@ -106,7 +113,7 @@ do LKMM_JOBS="`echo $njobs | sed -e 's/^\([0-9]\+\).*$/\1/'`" ;; --jobs|--job|-j) - checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]\+$' '^--' + checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]*$' '^--' LKMM_JOBS="$2" shift ;; @@ -120,6 +127,10 @@ do LKMM_TIMEOUT="$2" shift ;; + --) + shift + break + ;; *) echo Unknown argument $1 usage diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh new file mode 100755 index 000000000000..94608d4b6502 --- /dev/null +++ b/tools/memory-model/scripts/runlitmus.sh @@ -0,0 +1,80 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Without the -hw argument, runs a herd7 test and outputs verification +# results to a file whose name is that of the specified litmus test, +# but with ".out" appended. +# +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# +# Either way, return the status of the herd7 command. +# +# Usage: +# runlitmus.sh file.litmus +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The caller is expected to have +# properly set up the LKMM environment variables. +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney <paulmck@linux.ibm.com> + +litmus=$1 +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' !!! ' error: \"$litmus\" is not a readable file + exit 255 +fi + +if test -z "$LKMM_HW_MAP_FILE" -o ! -e $LKMM_DESTDIR/$litmus.out +then + # LKMM run + herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} + echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 + ret=$? + if test -z "$LKMM_HW_MAP_FILE" + then + exit $ret + fi + echo " --- " Automatically generated LKMM output for '"'--hw $LKMM_HW_MAP_FILE'"' run +fi + +# Hardware run + +T=/tmp/checklitmushw.sh.$$ +trap 'rm -rf $T' 0 2 +mkdir $T + +# Generate filenames +mapfile="Linux2${LKMM_HW_MAP_FILE}.map" +themefile="$T/${LKMM_HW_MAP_FILE}.theme" +herdoptions="-model $LKMM_HW_CAT_FILE" +hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'` +hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` + +# Don't run on litmus tests with complex synchronization +if ! scripts/simpletest.sh $litmus +then + echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU + exit 254 +fi + +# Generate the assembly code and run herd7 on it. +gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile +jingle7 -v -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out +if grep -q "Generated 0 tests" $T/$hwlitmusfile.jingle7.out +then + echo ' !!! ' jingle7 failed, errors in $hwlitmus.err + cp $T/$hwlitmusfile.jingle7.out $LKMM_DESTDIR/$hwlitmus.err + exit 253 +fi +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -unroll 0 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 + +exit $? diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh index 6ed376f495bb..c6c2bdc67a50 100755 --- a/tools/memory-model/scripts/runlitmushist.sh +++ b/tools/memory-model/scripts/runlitmushist.sh @@ -13,7 +13,9 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +# Author: Paul E. McKenney <paulmck@linux.ibm.com> + +. scripts/hwfnseg.sh T=/tmp/runlitmushist.sh.$$ trap 'rm -rf $T' 0 @@ -30,15 +32,12 @@ fi # Prefixes for per-CPU scripts for ((i=0;i<$LKMM_JOBS;i++)) do - echo dir="$LKMM_DESTDIR" > $T/$i.sh echo T=$T >> $T/$i.sh - echo herdoptions=\"$LKMM_HERD_OPTIONS\" >> $T/$i.sh cat << '___EOF___' >> $T/$i.sh runtest () { - echo ' ... ' /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 '>' $dir/$1.out '2>&1' - if /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 > $dir/$1.out 2>&1 + if scripts/runlitmus.sh $1 then - if ! grep -q '^Observation ' $dir/$1.out + if ! grep -q '^Observation ' $LKMM_DESTDIR/$1$2.out then echo ' !!! Herd failed, no Observation:' $1 fi @@ -47,10 +46,16 @@ do if test "$exitcode" -eq 124 then exitmsg="timed out" + elif test "$exitcode" -eq 253 + then + exitmsg= else exitmsg="failed, exit code $exitcode" fi - echo ' !!! Herd' ${exitmsg}: $1 + if test -n "$exitmsg" + then + echo ' !!! Herd' ${exitmsg}: $1 + fi fi } ___EOF___ @@ -59,11 +64,13 @@ done awk -v q="'" -v b='\\' ' { print "echo `grep " q "^P[0-9]" b "+(" q " " $0 " | tail -1 | sed -e " q "s/^P" b "([0-9]" b "+" b ")(.*$/" b "1/" q "` " $0 -}' | bash | -sort -k1n | -awk -v ncpu=$LKMM_JOBS -v t=$T ' +}' | sh | sort -k1n | +awk -v dq='"' -v hwfnseg="$hwfnseg" -v ncpu="$LKMM_JOBS" -v t="$T" ' { - print "runtest " $2 >> t "/" NR % ncpu ".sh"; + print "if test -z " dq hwfnseg dq " || scripts/simpletest.sh " dq $2 dq + print "then" + print "\techo runtest " dq $2 dq " " hwfnseg " >> " t "/" NR % ncpu ".sh"; + print "fi" } END { diff --git a/tools/memory-model/scripts/simpletest.sh b/tools/memory-model/scripts/simpletest.sh new file mode 100755 index 000000000000..7edc5d361665 --- /dev/null +++ b/tools/memory-model/scripts/simpletest.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Give zero status if this is a simple test and non-zero otherwise. +# Simple tests do not contain locking, RCU, or SRCU. +# +# Usage: +# simpletest.sh file.litmus +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney <paulmck@linux.ibm.com> + + +litmus=$1 + +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' --- ' error: \"$litmus\" is not a readable file + exit 255 +fi +exclude="^[[:space:]]*\(" +exclude="${exclude}spin_lock(\|spin_unlock(\|spin_trylock(\|spin_is_locked(" +exclude="${exclude}\|rcu_read_lock(\|rcu_read_unlock(" +exclude="${exclude}\|synchronize_rcu(\|synchronize_rcu_expedited(" +exclude="${exclude}\|srcu_read_lock(\|srcu_read_unlock(" +exclude="${exclude}\|synchronize_srcu(\|synchronize_srcu_expedited(" +exclude="${exclude}\)" +if grep -q $exclude $litmus +then + exit 255 +fi +exit 0 diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 7c2ac124cdc8..99798894b879 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -857,7 +857,7 @@ int main(int argc, char **argv) if (cull & CULL_PID || filter & FILTER_PID) fprintf(fout, ", PID %d", list[i].pid); if (cull & CULL_TGID || filter & FILTER_TGID) - fprintf(fout, ", TGID %d", list[i].pid); + fprintf(fout, ", TGID %d", list[i].tgid); if (cull & CULL_COMM || filter & FILTER_COMM) fprintf(fout, ", task_comm_name: %s", list[i].comm); if (cull & CULL_ALLOCATOR) { diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt index 8e53fc6735ef..744db4218e7a 100644 --- a/tools/objtool/Documentation/objtool.txt +++ b/tools/objtool/Documentation/objtool.txt @@ -181,7 +181,7 @@ b) ORC (Oops Rewind Capability) unwind table generation band. So it doesn't affect runtime performance and it can be reliable even when interrupts or exceptions are involved. - For more details, see Documentation/x86/orc-unwinder.rst. + For more details, see Documentation/arch/x86/orc-unwinder.rst. c) Higher live patching compatibility rate diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f937be1afe65..50ed63f701f1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1284,9 +1284,9 @@ static const char *uaccess_safe_builtin[] = { "copy_mc_fragile_handle_tail", "copy_mc_enhanced_fast_string", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ - "clear_user_erms", - "clear_user_rep_good", - "clear_user_original", + "rep_stos_alternative", + "rep_movs_alternative", + "__copy_user_nocache", NULL }; diff --git a/tools/rcu/extract-stall.sh b/tools/rcu/extract-stall.sh index e565697c9f90..08a39ad44320 100644..100755 --- a/tools/rcu/extract-stall.sh +++ b/tools/rcu/extract-stall.sh @@ -1,11 +1,25 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ -# -# Extract any RCU CPU stall warnings present in specified file. -# Filter out clocksource lines. Note that preceding-lines excludes the -# initial line of the stall warning but trailing-lines includes it. -# -# Usage: extract-stall.sh dmesg-file [ preceding-lines [ trailing-lines ] ] + +usage() { + echo Extract any RCU CPU stall warnings present in specified file. + echo Filter out clocksource lines. Note that preceding-lines excludes the + echo initial line of the stall warning but trailing-lines includes it. + echo + echo Usage: $(basename $0) dmesg-file [ preceding-lines [ trailing-lines ] ] + echo + echo Error: $1 +} + +# Terminate the script, if the argument is missing + +if test -f "$1" && test -r "$1" +then + : +else + usage "Console log file \"$1\" missing or unreadable." + exit 1 +fi echo $1 preceding_lines="${2-3}" diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index 741f15420467..3905c43369c3 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -123,7 +123,7 @@ def _suites_from_test_list(tests: List[str]) -> List[str]: parts = t.split('.', maxsplit=2) if len(parts) != 2: raise ValueError(f'internal KUnit error, test name should be of the form "<suite>.<test>", got "{t}"') - suite, case = parts + suite, _ = parts if not suites or suites[-1] != suite: suites.append(suite) return suites @@ -269,7 +269,7 @@ def massage_argv(argv: Sequence[str]) -> Sequence[str]: def get_default_jobs() -> int: return len(os.sched_getaffinity(0)) -def add_common_opts(parser) -> None: +def add_common_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' 'directory.', @@ -320,13 +320,13 @@ def add_common_opts(parser) -> None: help='Additional QEMU arguments, e.g. "-smp 8"', action='append', metavar='') -def add_build_opts(parser) -> None: +def add_build_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--jobs', help='As in the make command, "Specifies the number of ' 'jobs (commands) to run simultaneously."', type=int, default=get_default_jobs(), metavar='N') -def add_exec_opts(parser) -> None: +def add_exec_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--timeout', help='maximum number of seconds to allow for all tests ' 'to run. This does not include time taken to build the ' @@ -351,7 +351,7 @@ def add_exec_opts(parser) -> None: type=str, choices=['suite', 'test']) -def add_parse_opts(parser) -> None: +def add_parse_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--raw_output', help='If set don\'t parse output from kernel. ' 'By default, filters to just KUnit output. Use ' '--raw_output=all to show everything', @@ -386,7 +386,7 @@ def tree_from_args(cli_args: argparse.Namespace) -> kunit_kernel.LinuxSourceTree extra_qemu_args=qemu_args) -def run_handler(cli_args): +def run_handler(cli_args: argparse.Namespace) -> None: if not os.path.exists(cli_args.build_dir): os.mkdir(cli_args.build_dir) @@ -405,7 +405,7 @@ def run_handler(cli_args): sys.exit(1) -def config_handler(cli_args): +def config_handler(cli_args: argparse.Namespace) -> None: if cli_args.build_dir and ( not os.path.exists(cli_args.build_dir)): os.mkdir(cli_args.build_dir) @@ -421,7 +421,7 @@ def config_handler(cli_args): sys.exit(1) -def build_handler(cli_args): +def build_handler(cli_args: argparse.Namespace) -> None: linux = tree_from_args(cli_args) request = KunitBuildRequest(build_dir=cli_args.build_dir, make_options=cli_args.make_options, @@ -434,7 +434,7 @@ def build_handler(cli_args): sys.exit(1) -def exec_handler(cli_args): +def exec_handler(cli_args: argparse.Namespace) -> None: linux = tree_from_args(cli_args) exec_request = KunitExecRequest(raw_output=cli_args.raw_output, build_dir=cli_args.build_dir, @@ -450,10 +450,10 @@ def exec_handler(cli_args): sys.exit(1) -def parse_handler(cli_args): +def parse_handler(cli_args: argparse.Namespace) -> None: if cli_args.file is None: - sys.stdin.reconfigure(errors='backslashreplace') # pytype: disable=attribute-error - kunit_output = sys.stdin + sys.stdin.reconfigure(errors='backslashreplace') # type: ignore + kunit_output = sys.stdin # type: Iterable[str] else: with open(cli_args.file, 'r', errors='backslashreplace') as f: kunit_output = f.read().splitlines() @@ -475,7 +475,7 @@ subcommand_handlers_map = { } -def main(argv): +def main(argv: Sequence[str]) -> None: parser = argparse.ArgumentParser( description='Helps writing and running KUnit tests.') subparser = parser.add_subparsers(dest='subcommand') diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py index 48b5f34b2e5d..eb5dd01210b1 100644 --- a/tools/testing/kunit/kunit_config.py +++ b/tools/testing/kunit/kunit_config.py @@ -8,7 +8,7 @@ from dataclasses import dataclass import re -from typing import Dict, Iterable, List, Set, Tuple +from typing import Any, Dict, Iterable, List, Tuple CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$' CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+|".*")$' @@ -34,7 +34,7 @@ class Kconfig: def __init__(self) -> None: self._entries = {} # type: Dict[str, str] - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: if not isinstance(other, self.__class__): return False return self._entries == other._entries diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 53e90c335834..f01f94106129 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -16,9 +16,9 @@ import shutil import signal import threading from typing import Iterator, List, Optional, Tuple +from types import FrameType import kunit_config -from kunit_printer import stdout import qemu_config KCONFIG_PATH = '.config' @@ -57,7 +57,7 @@ class LinuxSourceTreeOperations: def make_arch_config(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: return base_kunitconfig - def make_olddefconfig(self, build_dir: str, make_options) -> None: + def make_olddefconfig(self, build_dir: str, make_options: Optional[List[str]]) -> None: command = ['make', 'ARCH=' + self._linux_arch, 'O=' + build_dir, 'olddefconfig'] if self._cross_compile: command += ['CROSS_COMPILE=' + self._cross_compile] @@ -71,7 +71,7 @@ class LinuxSourceTreeOperations: except subprocess.CalledProcessError as e: raise ConfigError(e.output.decode()) - def make(self, jobs, build_dir: str, make_options) -> None: + def make(self, jobs: int, build_dir: str, make_options: Optional[List[str]]) -> None: command = ['make', 'ARCH=' + self._linux_arch, 'O=' + build_dir, '--jobs=' + str(jobs)] if make_options: command.extend(make_options) @@ -92,7 +92,7 @@ class LinuxSourceTreeOperations: if stderr: # likely only due to build warnings print(stderr.decode()) - def start(self, params: List[str], build_dir: str) -> subprocess.Popen: + def start(self, params: List[str], build_dir: str) -> subprocess.Popen[str]: raise RuntimeError('not implemented!') @@ -106,13 +106,14 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations): self._kernel_path = qemu_arch_params.kernel_path self._kernel_command_line = qemu_arch_params.kernel_command_line + ' kunit_shutdown=reboot' self._extra_qemu_params = qemu_arch_params.extra_qemu_params + self._serial = qemu_arch_params.serial def make_arch_config(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: kconfig = kunit_config.parse_from_string(self._kconfig) kconfig.merge_in_entries(base_kunitconfig) return kconfig - def start(self, params: List[str], build_dir: str) -> subprocess.Popen: + def start(self, params: List[str], build_dir: str) -> subprocess.Popen[str]: kernel_path = os.path.join(build_dir, self._kernel_path) qemu_command = ['qemu-system-' + self._qemu_arch, '-nodefaults', @@ -121,7 +122,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations): '-append', ' '.join(params + [self._kernel_command_line]), '-no-reboot', '-nographic', - '-serial', 'stdio'] + self._extra_qemu_params + '-serial', self._serial] + self._extra_qemu_params # Note: shlex.join() does what we want, but requires python 3.8+. print('Running tests with:\n$', ' '.join(shlex.quote(arg) for arg in qemu_command)) return subprocess.Popen(qemu_command, @@ -133,7 +134,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations): class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations): """An abstraction over command line operations performed on a source tree.""" - def __init__(self, cross_compile=None): + def __init__(self, cross_compile: Optional[str]=None): super().__init__(linux_arch='um', cross_compile=cross_compile) def make_arch_config(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: @@ -141,7 +142,7 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations): kconfig.merge_in_entries(base_kunitconfig) return kconfig - def start(self, params: List[str], build_dir: str) -> subprocess.Popen: + def start(self, params: List[str], build_dir: str) -> subprocess.Popen[str]: """Runs the Linux UML binary. Must be named 'linux'.""" linux_bin = os.path.join(build_dir, 'linux') params.extend(['mem=1G', 'console=tty', 'kunit_shutdown=halt']) @@ -216,7 +217,7 @@ def _get_qemu_ops(config_path: str, if not hasattr(config, 'QEMU_ARCH'): raise ValueError('qemu_config module missing "QEMU_ARCH": ' + config_path) - params: qemu_config.QemuArchParams = config.QEMU_ARCH # type: ignore + params: qemu_config.QemuArchParams = config.QEMU_ARCH if extra_qemu_args: params.extra_qemu_params.extend(extra_qemu_args) return params.linux_arch, LinuxSourceTreeOperationsQemu( @@ -230,10 +231,10 @@ class LinuxSourceTree: build_dir: str, kunitconfig_paths: Optional[List[str]]=None, kconfig_add: Optional[List[str]]=None, - arch=None, - cross_compile=None, - qemu_config_path=None, - extra_qemu_args=None) -> None: + arch: Optional[str]=None, + cross_compile: Optional[str]=None, + qemu_config_path: Optional[str]=None, + extra_qemu_args: Optional[List[str]]=None) -> None: signal.signal(signal.SIGINT, self.signal_handler) if qemu_config_path: self._arch, self._ops = _get_qemu_ops(qemu_config_path, extra_qemu_args, cross_compile) @@ -276,7 +277,7 @@ class LinuxSourceTree: logging.error(message) return False - def build_config(self, build_dir: str, make_options) -> bool: + def build_config(self, build_dir: str, make_options: Optional[List[str]]) -> bool: kconfig_path = get_kconfig_path(build_dir) if build_dir and not os.path.exists(build_dir): os.mkdir(build_dir) @@ -304,7 +305,7 @@ class LinuxSourceTree: old_kconfig = kunit_config.parse_file(old_path) return old_kconfig != self._kconfig - def build_reconfig(self, build_dir: str, make_options) -> bool: + def build_reconfig(self, build_dir: str, make_options: Optional[List[str]]) -> bool: """Creates a new .config if it is not a subset of the .kunitconfig.""" kconfig_path = get_kconfig_path(build_dir) if not os.path.exists(kconfig_path): @@ -320,7 +321,7 @@ class LinuxSourceTree: os.remove(kconfig_path) return self.build_config(build_dir, make_options) - def build_kernel(self, jobs, build_dir: str, make_options) -> bool: + def build_kernel(self, jobs: int, build_dir: str, make_options: Optional[List[str]]) -> bool: try: self._ops.make_olddefconfig(build_dir, make_options) self._ops.make(jobs, build_dir, make_options) @@ -329,7 +330,7 @@ class LinuxSourceTree: return False return self.validate_config(build_dir) - def run_kernel(self, args=None, build_dir='', filter_glob='', timeout=None) -> Iterator[str]: + def run_kernel(self, args: Optional[List[str]]=None, build_dir: str='', filter_glob: str='', timeout: Optional[int]=None) -> Iterator[str]: if not args: args = [] if filter_glob: @@ -340,7 +341,7 @@ class LinuxSourceTree: assert process.stdout is not None # tell mypy it's set # Enforce the timeout in a background thread. - def _wait_proc(): + def _wait_proc() -> None: try: process.wait(timeout=timeout) except Exception as e: @@ -366,6 +367,6 @@ class LinuxSourceTree: waiter.join() subprocess.call(['stty', 'sane']) - def signal_handler(self, unused_sig, unused_frame) -> None: + def signal_handler(self, unused_sig: int, unused_frame: Optional[FrameType]) -> None: logging.error('Build interruption occurred. Cleaning console.') subprocess.call(['stty', 'sane']) diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index a225799f6b1b..fbc094f0567e 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -12,7 +12,6 @@ from __future__ import annotations from dataclasses import dataclass import re -import sys import textwrap from enum import Enum, auto diff --git a/tools/testing/kunit/kunit_printer.py b/tools/testing/kunit/kunit_printer.py index 5f1cc55ecdf5..015adf87dc2c 100644 --- a/tools/testing/kunit/kunit_printer.py +++ b/tools/testing/kunit/kunit_printer.py @@ -15,7 +15,7 @@ _RESET = '\033[0;0m' class Printer: """Wraps a file object, providing utilities for coloring output, etc.""" - def __init__(self, output: typing.IO): + def __init__(self, output: typing.IO[str]): self._output = output self._use_color = output.isatty() diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 0c2190514103..be35999bb84f 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -328,7 +328,7 @@ class KUnitParserTest(unittest.TestCase): def test_parse_subtest_header(self): ktap_log = test_data_path('test_parse_subtest_header.log') with open(ktap_log) as file: - result = kunit_parser.parse_run_tests(file.readlines()) + kunit_parser.parse_run_tests(file.readlines()) self.print_mock.assert_any_call(StrContains('suite (1 subtest)')) def test_show_test_output_on_failure(self): diff --git a/tools/testing/kunit/qemu_config.py b/tools/testing/kunit/qemu_config.py index 0b6a80398ccc..b1fba9016eed 100644 --- a/tools/testing/kunit/qemu_config.py +++ b/tools/testing/kunit/qemu_config.py @@ -17,3 +17,4 @@ class QemuArchParams: kernel_path: str kernel_command_line: str extra_qemu_params: List[str] + serial: str = 'stdio' diff --git a/tools/testing/kunit/qemu_configs/m68k.py b/tools/testing/kunit/qemu_configs/m68k.py new file mode 100644 index 000000000000..287fc386f8a7 --- /dev/null +++ b/tools/testing/kunit/qemu_configs/m68k.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +from ..qemu_config import QemuArchParams + +QEMU_ARCH = QemuArchParams(linux_arch='m68k', + kconfig=''' +CONFIG_VIRT=y''', + qemu_arch='m68k', + kernel_path='vmlinux', + kernel_command_line='console=hvc0', + extra_qemu_params=['-machine', 'virt']) diff --git a/tools/testing/kunit/qemu_configs/sh.py b/tools/testing/kunit/qemu_configs/sh.py new file mode 100644 index 000000000000..78a474a5b95f --- /dev/null +++ b/tools/testing/kunit/qemu_configs/sh.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only +from ..qemu_config import QemuArchParams + +QEMU_ARCH = QemuArchParams(linux_arch='sh', + kconfig=''' +CONFIG_CPU_SUBTYPE_SH7751R=y +CONFIG_MEMORY_START=0x0c000000 +CONFIG_SH_RTS7751R2D=y +CONFIG_RTS7751R2D_PLUS=y +CONFIG_SERIAL_SH_SCI=y''', + qemu_arch='sh4', + kernel_path='arch/sh/boot/zImage', + kernel_command_line='console=ttySC1', + serial='null', + extra_qemu_params=[ + '-machine', 'r2d', + '-serial', 'mon:stdio']) diff --git a/tools/testing/kunit/run_checks.py b/tools/testing/kunit/run_checks.py index 066e6f938f6d..8208c3b3135e 100755 --- a/tools/testing/kunit/run_checks.py +++ b/tools/testing/kunit/run_checks.py @@ -23,7 +23,7 @@ commands: Dict[str, Sequence[str]] = { 'kunit_tool_test.py': ['./kunit_tool_test.py'], 'kunit smoke test': ['./kunit.py', 'run', '--kunitconfig=lib/kunit', '--build_dir=kunit_run_checks'], 'pytype': ['/bin/sh', '-c', 'pytype *.py'], - 'mypy': ['/bin/sh', '-c', 'mypy *.py'], + 'mypy': ['mypy', '--strict', '--exclude', '_test.py$', '--exclude', 'qemu_configs/', '.'], } # The user might not have mypy or pytype installed, skip them if so. @@ -37,7 +37,7 @@ def main(argv: Sequence[str]) -> None: if argv: raise RuntimeError('This script takes no arguments') - future_to_name: Dict[futures.Future, str] = {} + future_to_name: Dict[futures.Future[None], str] = {} executor = futures.ThreadPoolExecutor(max_workers=len(commands)) for name, argv in commands.items(): if name in necessary_deps and shutil.which(necessary_deps[name]) is None: @@ -73,7 +73,7 @@ def main(argv: Sequence[str]) -> None: sys.exit(1) -def run_cmd(argv: Sequence[str]): +def run_cmd(argv: Sequence[str]) -> None: subprocess.check_output(argv, stderr=subprocess.STDOUT, cwd=ABS_TOOL_PATH, timeout=TIMEOUT) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 13a6837a0c6b..97dcdaa656f6 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -58,6 +58,7 @@ TARGETS += nsfs TARGETS += pidfd TARGETS += pid_namespace TARGETS += powerpc +TARGETS += prctl TARGETS += proc TARGETS += pstore TARGETS += ptrace diff --git a/tools/testing/selftests/amd-pstate/gitsource.sh b/tools/testing/selftests/amd-pstate/gitsource.sh index dbc1fe45599d..5f2171f0116d 100755 --- a/tools/testing/selftests/amd-pstate/gitsource.sh +++ b/tools/testing/selftests/amd-pstate/gitsource.sh @@ -117,7 +117,7 @@ parse_gitsource() printf "Gitsource-$1-#$2 power consumption(J): $en_sum\n" | tee -a $OUTFILE_GIT.result # Permance is the number of run gitsource per second, denoted 1/t, where 1 is the number of run gitsource in t - # senconds. It is well known that P=E/t, where P is power measured in watts(W), E is energy measured in joules(J), + # seconds. It is well known that P=E/t, where P is power measured in watts(W), E is energy measured in joules(J), # and t is time measured in seconds(s). This means that performance per watt becomes # 1/t 1/t 1 # ----- = ----- = --- @@ -175,7 +175,7 @@ gather_gitsource() printf "Gitsource-$1 avg power consumption(J): $avg_en\n" | tee -a $OUTFILE_GIT.result # Permance is the number of run gitsource per second, denoted 1/t, where 1 is the number of run gitsource in t - # senconds. It is well known that P=E/t, where P is power measured in watts(W), E is energy measured in joules(J), + # seconds. It is well known that P=E/t, where P is power measured in watts(W), E is energy measured in joules(J), # and t is time measured in seconds(s). This means that performance per watt becomes # 1/t 1/t 1 # ----- = ----- = --- diff --git a/tools/testing/selftests/amd-pstate/run.sh b/tools/testing/selftests/amd-pstate/run.sh index 57cad57e59c0..de4d8e9c9565 100755 --- a/tools/testing/selftests/amd-pstate/run.sh +++ b/tools/testing/selftests/amd-pstate/run.sh @@ -244,7 +244,7 @@ prerequisite() if [ "$scaling_driver" != "$CURRENT_TEST" ]; then echo "$0 # Skipped: Test can only run on $CURRENT_TEST driver or run comparative test." echo "$0 # Please set X86_AMD_PSTATE enabled or run comparative test." - echo "$0 # Current cpufreq scaling drvier is $scaling_driver." + echo "$0 # Current cpufreq scaling driver is $scaling_driver." exit $ksft_skip fi else @@ -252,7 +252,7 @@ prerequisite() "tbench" | "gitsource") if [ "$scaling_driver" != "$COMPARATIVE_TEST" ]; then echo "$0 # Skipped: Comparison test can only run on $COMPARISON_TEST driver." - echo "$0 # Current cpufreq scaling drvier is $scaling_driver." + echo "$0 # Current cpufreq scaling driver is $scaling_driver." exit $ksft_skip fi ;; diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile index 48f56c86ad45..b413b0af07f9 100644 --- a/tools/testing/selftests/arm64/fp/Makefile +++ b/tools/testing/selftests/arm64/fp/Makefile @@ -38,7 +38,7 @@ $(OUTPUT)/vec-syscfg: vec-syscfg.c $(OUTPUT)/rdvl.o $(OUTPUT)/vlset: vlset.c $(OUTPUT)/za-fork: za-fork.c $(OUTPUT)/za-fork-asm.o $(CC) -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - -include ../../../../include/nolibc/nolibc.h \ + -include ../../../../include/nolibc/nolibc.h -I../..\ -static -ffreestanding -Wall $^ -o $@ $(OUTPUT)/za-ptrace: za-ptrace.c $(OUTPUT)/za-test: za-test.S $(OUTPUT)/asm-utils.o diff --git a/tools/testing/selftests/arm64/fp/za-fork.c b/tools/testing/selftests/arm64/fp/za-fork.c index ff475c649e96..b86cb1049497 100644 --- a/tools/testing/selftests/arm64/fp/za-fork.c +++ b/tools/testing/selftests/arm64/fp/za-fork.c @@ -9,42 +9,9 @@ #include <linux/sched.h> #include <linux/wait.h> -#define EXPECTED_TESTS 1 - -static void putstr(const char *str) -{ - write(1, str, strlen(str)); -} - -static void putnum(unsigned int num) -{ - char c; - - if (num / 10) - putnum(num / 10); - - c = '0' + (num % 10); - write(1, &c, 1); -} +#include "kselftest.h" -static int tests_run; -static int tests_passed; -static int tests_failed; -static int tests_skipped; - -static void print_summary(void) -{ - if (tests_passed + tests_failed + tests_skipped != EXPECTED_TESTS) - putstr("# UNEXPECTED TEST COUNT: "); - - putstr("# Totals: pass:"); - putnum(tests_passed); - putstr(" fail:"); - putnum(tests_failed); - putstr(" xfail:0 xpass:0 skip:"); - putnum(tests_skipped); - putstr(" error:0\n"); -} +#define EXPECTED_TESTS 1 int fork_test(void); int verify_fork(void); @@ -63,22 +30,21 @@ int fork_test_c(void) if (newpid == 0) { /* In child */ if (!verify_fork()) { - putstr("# ZA state invalid in child\n"); + ksft_print_msg("ZA state invalid in child\n"); exit(0); } else { exit(1); } } if (newpid < 0) { - putstr("# fork() failed: -"); - putnum(-newpid); - putstr("\n"); + ksft_print_msg("fork() failed: %d\n", newpid); + return 0; } parent_result = verify_fork(); if (!parent_result) - putstr("# ZA state invalid in parent\n"); + ksft_print_msg("ZA state invalid in parent\n"); for (;;) { waiting = waitpid(newpid, &child_status, 0); @@ -86,18 +52,16 @@ int fork_test_c(void) if (waiting < 0) { if (errno == EINTR) continue; - putstr("# waitpid() failed: "); - putnum(errno); - putstr("\n"); + ksft_print_msg("waitpid() failed: %d\n", errno); return 0; } if (waiting != newpid) { - putstr("# waitpid() returned wrong PID\n"); + ksft_print_msg("waitpid() returned wrong PID\n"); return 0; } if (!WIFEXITED(child_status)) { - putstr("# child did not exit\n"); + ksft_print_msg("child did not exit\n"); return 0; } @@ -105,29 +69,14 @@ int fork_test_c(void) } } -#define run_test(name) \ - if (name()) { \ - tests_passed++; \ - } else { \ - tests_failed++; \ - putstr("not "); \ - } \ - putstr("ok "); \ - putnum(++tests_run); \ - putstr(" " #name "\n"); - int main(int argc, char **argv) { int ret, i; - putstr("TAP version 13\n"); - putstr("1.."); - putnum(EXPECTED_TESTS); - putstr("\n"); + ksft_print_header(); + ksft_set_plan(EXPECTED_TESTS); - putstr("# PID: "); - putnum(getpid()); - putstr("\n"); + ksft_print_msg("PID: %d\n", getpid()); /* * This test is run with nolibc which doesn't support hwcap and @@ -136,21 +85,16 @@ int main(int argc, char **argv) */ ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0); if (ret >= 0) { - run_test(fork_test); + ksft_test_result(fork_test(), "fork_test"); } else { - putstr("# SME support not present\n"); - + ksft_print_msg("SME not supported\n"); for (i = 0; i < EXPECTED_TESTS; i++) { - putstr("ok "); - putnum(i); - putstr(" skipped\n"); + ksft_test_result_skip("fork_test\n"); } - - tests_skipped += EXPECTED_TESTS; } - print_summary(); + ksft_finished(); return 0; } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index 7271a18ab3e2..8251a0fc6ee9 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -167,8 +167,7 @@ void test_xdp_do_redirect(void) if (!ASSERT_EQ(query_opts.feature_flags, NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG | - NETDEV_XDP_ACT_NDO_XMIT_SG, + NETDEV_XDP_ACT_RX_SG, "veth_src query_opts.feature_flags")) goto out; @@ -178,9 +177,34 @@ void test_xdp_do_redirect(void) if (!ASSERT_EQ(query_opts.feature_flags, NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_RX_SG, + "veth_dst query_opts.feature_flags")) + goto out; + + /* Enable GRO */ + SYS("ethtool -K veth_src gro on"); + SYS("ethtool -K veth_dst gro on"); + + err = bpf_xdp_query(ifindex_src, XDP_FLAGS_DRV_MODE, &query_opts); + if (!ASSERT_OK(err, "veth_src bpf_xdp_query gro on")) + goto out; + + if (!ASSERT_EQ(query_opts.feature_flags, + NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG | NETDEV_XDP_ACT_NDO_XMIT_SG, - "veth_dst query_opts.feature_flags")) + "veth_src query_opts.feature_flags gro on")) + goto out; + + err = bpf_xdp_query(ifindex_dst, XDP_FLAGS_DRV_MODE, &query_opts); + if (!ASSERT_OK(err, "veth_dst bpf_xdp_query gro on")) + goto out; + + if (!ASSERT_EQ(query_opts.feature_flags, + NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG | + NETDEV_XDP_ACT_NDO_XMIT_SG, + "veth_dst query_opts.feature_flags gro on")) goto out; memcpy(skel->rodata->expect_dst, &pkt_udp.eth.h_dest, ETH_ALEN); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index aa4beae99f4f..8c5e98da9ae9 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -273,6 +273,8 @@ static int verify_xsk_metadata(struct xsk *xsk) if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash")) return -1; + ASSERT_EQ(meta->rx_hash_type, 0, "rx_hash_type"); + xsk_ring_cons__release(&xsk->rx, 1); refill_rx(xsk, comp_addr); diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c index 4c55b4d79d3d..e1c787815e44 100644 --- a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c @@ -12,10 +12,14 @@ struct { __type(value, __u32); } xsk SEC(".maps"); +__u64 pkts_skip = 0; +__u64 pkts_fail = 0; +__u64 pkts_redir = 0; + extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, __u64 *timestamp) __ksym; -extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, - __u32 *hash) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, + enum xdp_rss_hash_type *rss_type) __ksym; SEC("xdp") int rx(struct xdp_md *ctx) @@ -26,7 +30,7 @@ int rx(struct xdp_md *ctx) struct udphdr *udp = NULL; struct iphdr *iph = NULL; struct xdp_meta *meta; - int ret; + int err; data = (void *)(long)ctx->data; data_end = (void *)(long)ctx->data_end; @@ -46,17 +50,20 @@ int rx(struct xdp_md *ctx) udp = NULL; } - if (!udp) + if (!udp) { + __sync_add_and_fetch(&pkts_skip, 1); return XDP_PASS; + } - if (udp->dest != bpf_htons(9091)) + /* Forwarding UDP:9091 to AF_XDP */ + if (udp->dest != bpf_htons(9091)) { + __sync_add_and_fetch(&pkts_skip, 1); return XDP_PASS; + } - bpf_printk("forwarding UDP:9091 to AF_XDP"); - - ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta)); - if (ret != 0) { - bpf_printk("bpf_xdp_adjust_meta returned %d", ret); + err = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta)); + if (err) { + __sync_add_and_fetch(&pkts_fail, 1); return XDP_PASS; } @@ -65,20 +72,19 @@ int rx(struct xdp_md *ctx) meta = data_meta; if (meta + 1 > data) { - bpf_printk("bpf_xdp_adjust_meta doesn't appear to work"); + __sync_add_and_fetch(&pkts_fail, 1); return XDP_PASS; } - if (!bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp)) - bpf_printk("populated rx_timestamp with %llu", meta->rx_timestamp); - else + err = bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp); + if (err) meta->rx_timestamp = 0; /* Used by AF_XDP as not avail signal */ - if (!bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash)) - bpf_printk("populated rx_hash with %u", meta->rx_hash); - else - meta->rx_hash = 0; /* Used by AF_XDP as not avail signal */ + err = bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash, &meta->rx_hash_type); + if (err < 0) + meta->rx_hash_err = err; /* Used by AF_XDP as no hash signal */ + __sync_add_and_fetch(&pkts_redir, 1); return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS); } diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c index 77678b034389..d151d406a123 100644 --- a/tools/testing/selftests/bpf/progs/xdp_metadata.c +++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c @@ -21,8 +21,8 @@ struct { extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, __u64 *timestamp) __ksym; -extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, - __u32 *hash) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, + enum xdp_rss_hash_type *rss_type) __ksym; SEC("xdp") int rx(struct xdp_md *ctx) @@ -56,7 +56,7 @@ int rx(struct xdp_md *ctx) if (timestamp == 0) meta->rx_timestamp = 1; - bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash); + bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash, &meta->rx_hash_type); return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS); } diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata2.c b/tools/testing/selftests/bpf/progs/xdp_metadata2.c index cf69d05451c3..85f88d9d7a78 100644 --- a/tools/testing/selftests/bpf/progs/xdp_metadata2.c +++ b/tools/testing/selftests/bpf/progs/xdp_metadata2.c @@ -5,17 +5,18 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, - __u32 *hash) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, + enum xdp_rss_hash_type *rss_type) __ksym; int called; SEC("freplace/rx") int freplace_rx(struct xdp_md *ctx) { + enum xdp_rss_hash_type type = 0; u32 hash = 0; /* Call _any_ metadata function to make sure we don't crash. */ - bpf_xdp_metadata_rx_hash(ctx, &hash); + bpf_xdp_metadata_rx_hash(ctx, &hash, &type); called++; return XDP_PASS; } diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 1c8acb68b977..987cf0db5ebc 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -141,7 +141,11 @@ static void verify_xdp_metadata(void *data) meta = data - sizeof(*meta); printf("rx_timestamp: %llu\n", meta->rx_timestamp); - printf("rx_hash: %u\n", meta->rx_hash); + if (meta->rx_hash_err < 0) + printf("No rx_hash err=%d\n", meta->rx_hash_err); + else + printf("rx_hash: 0x%X with RSS type:0x%X\n", + meta->rx_hash, meta->rx_hash_type); } static void verify_skb_metadata(int fd) @@ -212,7 +216,9 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd) while (true) { errno = 0; ret = poll(fds, rxq + 1, 1000); - printf("poll: %d (%d)\n", ret, errno); + printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n", + ret, errno, bpf_obj->bss->pkts_skip, + bpf_obj->bss->pkts_fail, bpf_obj->bss->pkts_redir); if (ret < 0) break; if (ret == 0) diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h index f6780fbb0a21..0c4624dc6f2f 100644 --- a/tools/testing/selftests/bpf/xdp_metadata.h +++ b/tools/testing/selftests/bpf/xdp_metadata.h @@ -12,4 +12,8 @@ struct xdp_meta { __u64 rx_timestamp; __u32 rx_hash; + union { + __u32 rx_hash_type; + __s32 rx_hash_err; + }; }; diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 1e616a8c6a9c..f4f7c0aef702 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -98,6 +98,11 @@ static int alloc_anon_50M_check(const char *cgroup, void *arg) int ret = -1; buf = malloc(size); + if (buf == NULL) { + fprintf(stderr, "malloc() failed\n"); + return -1; + } + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) *ptr = 0; @@ -211,6 +216,11 @@ static int alloc_anon_noexit(const char *cgroup, void *arg) char *buf, *ptr; buf = malloc(size); + if (buf == NULL) { + fprintf(stderr, "malloc() failed\n"); + return -1; + } + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) *ptr = 0; @@ -778,6 +788,11 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) int ret = -1; buf = malloc(size); + if (buf == NULL) { + fprintf(stderr, "malloc() failed\n"); + return -1; + } + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) *ptr = 0; diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 4fce46afe6db..e495f895a2cd 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) uid_t uid = getuid(); ksft_print_header(); - ksft_set_plan(17); + ksft_set_plan(18); test_clone3_supported(); /* Just a simple clone3() should return 0.*/ @@ -198,5 +198,5 @@ int main(int argc, char *argv[]) /* Do a clone3() in a new time namespace */ test_clone3(CLONE_NEWTIME, 0, 0, CLONE3_ARGS_NO_TEST); - return !ksft_get_fail_cnt() ? ksft_exit_pass() : ksft_exit_fail(); + ksft_finished(); } diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index a39bb2560d9b..03f92d7aeb19 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -8,11 +8,12 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ - option_prio.sh \ + bond_options.sh \ bond-eth-type-change.sh TEST_FILES := \ lag_lib.sh \ + bond_topo_3d1c.sh \ net_forwarding_lib.sh include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh new file mode 100755 index 000000000000..db29a3146a86 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test bonding options with mode 1,5,6 + +ALL_TESTS=" + prio + arp_validate +" + +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +source ${lib_dir}/net_forwarding_lib.sh +source ${lib_dir}/bond_topo_3d1c.sh + +skip_prio() +{ + local skip=1 + + # check if iproute support prio option + ip -n ${s_ns} link set eth0 type bond_slave prio 10 + [[ $? -ne 0 ]] && skip=0 + + # check if kernel support prio option + ip -n ${s_ns} -d link show eth0 | grep -q "prio 10" + [[ $? -ne 0 ]] && skip=0 + + return $skip +} + +skip_ns() +{ + local skip=1 + + # check if iproute support ns_ip6_target option + ip -n ${s_ns} link add bond1 type bond ns_ip6_target ${g_ip6} + [[ $? -ne 0 ]] && skip=0 + + # check if kernel support ns_ip6_target option + ip -n ${s_ns} -d link show bond1 | grep -q "ns_ip6_target ${g_ip6}" + [[ $? -ne 0 ]] && skip=0 + + ip -n ${s_ns} link del bond1 + + return $skip +} + +active_slave="" +check_active_slave() +{ + local target_active_slave=$1 + active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + test "$active_slave" = "$target_active_slave" + check_err $? "Current active slave is $active_slave but not $target_active_slave" +} + + +# Test bonding prio option +prio_test() +{ + local param="$1" + RET=0 + + # create bond + bond_reset "${param}" + + # check bonding member prio value + ip -n ${s_ns} link set eth0 type bond_slave prio 0 + ip -n ${s_ns} link set eth1 type bond_slave prio 10 + ip -n ${s_ns} link set eth2 type bond_slave prio 11 + cmd_jq "ip -n ${s_ns} -d -j link show eth0" \ + ".[].linkinfo.info_slave_data | select (.prio == 0)" "-e" &> /dev/null + check_err $? "eth0 prio is not 0" + cmd_jq "ip -n ${s_ns} -d -j link show eth1" \ + ".[].linkinfo.info_slave_data | select (.prio == 10)" "-e" &> /dev/null + check_err $? "eth1 prio is not 10" + cmd_jq "ip -n ${s_ns} -d -j link show eth2" \ + ".[].linkinfo.info_slave_data | select (.prio == 11)" "-e" &> /dev/null + check_err $? "eth2 prio is not 11" + + bond_check_connection "setup" + + # active slave should be the primary slave + check_active_slave eth1 + + # active slave should be the higher prio slave + ip -n ${s_ns} link set $active_slave down + bond_check_connection "fail over" + check_active_slave eth2 + + # when only 1 slave is up + ip -n ${s_ns} link set $active_slave down + bond_check_connection "only 1 slave up" + check_active_slave eth0 + + # when a higher prio slave change to up + ip -n ${s_ns} link set eth2 up + bond_check_connection "higher prio slave up" + case $primary_reselect in + "0") + check_active_slave "eth2" + ;; + "1") + check_active_slave "eth0" + ;; + "2") + check_active_slave "eth0" + ;; + esac + local pre_active_slave=$active_slave + + # when the primary slave change to up + ip -n ${s_ns} link set eth1 up + bond_check_connection "primary slave up" + case $primary_reselect in + "0") + check_active_slave "eth1" + ;; + "1") + check_active_slave "$pre_active_slave" + ;; + "2") + check_active_slave "$pre_active_slave" + ip -n ${s_ns} link set $active_slave down + bond_check_connection "pre_active slave down" + check_active_slave "eth1" + ;; + esac + + # Test changing bond slave prio + if [[ "$primary_reselect" == "0" ]];then + ip -n ${s_ns} link set eth0 type bond_slave prio 1000000 + ip -n ${s_ns} link set eth1 type bond_slave prio 0 + ip -n ${s_ns} link set eth2 type bond_slave prio -50 + ip -n ${s_ns} -d link show eth0 | grep -q 'prio 1000000' + check_err $? "eth0 prio is not 1000000" + ip -n ${s_ns} -d link show eth1 | grep -q 'prio 0' + check_err $? "eth1 prio is not 0" + ip -n ${s_ns} -d link show eth2 | grep -q 'prio -50' + check_err $? "eth3 prio is not -50" + check_active_slave "eth1" + + ip -n ${s_ns} link set $active_slave down + bond_check_connection "change slave prio" + check_active_slave "eth0" + fi +} + +prio_miimon() +{ + local primary_reselect + local mode=$1 + + for primary_reselect in 0 1 2; do + prio_test "mode $mode miimon 100 primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode miimon primary_reselect $primary_reselect" + done +} + +prio_arp() +{ + local primary_reselect + local mode=$1 + + for primary_reselect in 0 1 2; do + prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode arp_ip_target primary_reselect $primary_reselect" + done +} + +prio_ns() +{ + local primary_reselect + local mode=$1 + + if skip_ns; then + log_test_skip "prio ns" "Current iproute or kernel doesn't support bond option 'ns_ip6_target'." + return 0 + fi + + for primary_reselect in 0 1 2; do + prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect" + done +} + +prio() +{ + local mode modes="active-backup balance-tlb balance-alb" + + if skip_prio; then + log_test_skip "prio" "Current iproute or kernel doesn't support bond option 'prio'." + return 0 + fi + + for mode in $modes; do + prio_miimon $mode + prio_arp $mode + prio_ns $mode + done +} + +arp_validate_test() +{ + local param="$1" + RET=0 + + # create bond + bond_reset "${param}" + + bond_check_connection + [ $RET -ne 0 ] && log_test "arp_validate" "$retmsg" + + # wait for a while to make sure the mii status stable + sleep 5 + for i in $(seq 0 2); do + mii_status=$(cmd_jq "ip -n ${s_ns} -j -d link show eth$i" ".[].linkinfo.info_slave_data.mii_status") + if [ ${mii_status} != "UP" ]; then + RET=1 + log_test "arp_validate" "interface eth$i mii_status $mii_status" + fi + done +} + +arp_validate_arp() +{ + local mode=$1 + local val + for val in $(seq 0 6); do + arp_validate_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} arp_validate $val" + log_test "arp_validate" "$mode arp_ip_target arp_validate $val" + done +} + +arp_validate_ns() +{ + local mode=$1 + local val + + if skip_ns; then + log_test_skip "arp_validate ns" "Current iproute or kernel doesn't support bond option 'ns_ip6_target'." + return 0 + fi + + for val in $(seq 0 6); do + arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} arp_validate $val" + log_test "arp_validate" "$mode ns_ip6_target arp_validate $val" + done +} + +arp_validate() +{ + arp_validate_arp "active-backup" + arp_validate_ns "active-backup" +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh new file mode 100644 index 000000000000..4045ca97fb22 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Topology for Bond mode 1,5,6 testing +# +# +-------------------------------------+ +# | bond0 | +# | + | Server +# | eth0 | eth1 eth2 | 192.0.2.1/24 +# | +-------------------+ | 2001:db8::1/24 +# | | | | | +# +-------------------------------------+ +# | | | +# +-------------------------------------+ +# | | | | | +# | +---+---------+---------+---+ | Gateway +# | | br0 | | 192.0.2.254/24 +# | +-------------+-------------+ | 2001:db8::254/24 +# | | | +# +-------------------------------------+ +# | +# +-------------------------------------+ +# | | | Client +# | + | 192.0.2.10/24 +# | eth0 | 2001:db8::10/24 +# +-------------------------------------+ + +s_ns="s-$(mktemp -u XXXXXX)" +c_ns="c-$(mktemp -u XXXXXX)" +g_ns="g-$(mktemp -u XXXXXX)" +s_ip4="192.0.2.1" +c_ip4="192.0.2.10" +g_ip4="192.0.2.254" +s_ip6="2001:db8::1" +c_ip6="2001:db8::10" +g_ip6="2001:db8::254" + +gateway_create() +{ + ip netns add ${g_ns} + ip -n ${g_ns} link add br0 type bridge + ip -n ${g_ns} link set br0 up + ip -n ${g_ns} addr add ${g_ip4}/24 dev br0 + ip -n ${g_ns} addr add ${g_ip6}/24 dev br0 +} + +gateway_destroy() +{ + ip -n ${g_ns} link del br0 + ip netns del ${g_ns} +} + +server_create() +{ + ip netns add ${s_ns} + ip -n ${s_ns} link add bond0 type bond mode active-backup miimon 100 + + for i in $(seq 0 2); do + ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + + ip -n ${g_ns} link set s${i} up + ip -n ${g_ns} link set s${i} master br0 + ip -n ${s_ns} link set eth${i} master bond0 + done + + ip -n ${s_ns} link set bond0 up + ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0 + ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0 + sleep 2 +} + +# Reset bond with new mode and options +bond_reset() +{ + local param="$1" + + ip -n ${s_ns} link set bond0 down + ip -n ${s_ns} link del bond0 + + ip -n ${s_ns} link add bond0 type bond $param + for i in $(seq 0 2); do + ip -n ${s_ns} link set eth$i master bond0 + done + + ip -n ${s_ns} link set bond0 up + ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0 + ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0 + sleep 2 +} + +server_destroy() +{ + for i in $(seq 0 2); do + ip -n ${s_ns} link del eth${i} + done + ip netns del ${s_ns} +} + +client_create() +{ + ip netns add ${c_ns} + ip -n ${c_ns} link add eth0 type veth peer name c0 netns ${g_ns} + + ip -n ${g_ns} link set c0 up + ip -n ${g_ns} link set c0 master br0 + + ip -n ${c_ns} link set eth0 up + ip -n ${c_ns} addr add ${c_ip4}/24 dev eth0 + ip -n ${c_ns} addr add ${c_ip6}/24 dev eth0 +} + +client_destroy() +{ + ip -n ${c_ns} link del eth0 + ip netns del ${c_ns} +} + +setup_prepare() +{ + gateway_create + server_create + client_create +} + +cleanup() +{ + pre_cleanup + + client_destroy + server_destroy + gateway_destroy +} + +bond_check_connection() +{ + local msg=${1:-"check connection"} + + sleep 2 + ip netns exec ${s_ns} ping ${c_ip4} -c5 -i 0.1 &>/dev/null + check_err $? "${msg}: ping failed" + ip netns exec ${s_ns} ping6 ${c_ip6} -c5 -i 0.1 &>/dev/null + check_err $? "${msg}: ping6 failed" +} diff --git a/tools/testing/selftests/drivers/net/bonding/option_prio.sh b/tools/testing/selftests/drivers/net/bonding/option_prio.sh deleted file mode 100755 index c32eebff5005..000000000000 --- a/tools/testing/selftests/drivers/net/bonding/option_prio.sh +++ /dev/null @@ -1,245 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test bonding option prio -# - -ALL_TESTS=" - prio_arp_ip_target_test - prio_miimon_test -" - -REQUIRE_MZ=no -REQUIRE_JQ=no -NUM_NETIFS=0 -lib_dir=$(dirname "$0") -source "$lib_dir"/net_forwarding_lib.sh - -destroy() -{ - ip link del bond0 &>/dev/null - ip link del br0 &>/dev/null - ip link del veth0 &>/dev/null - ip link del veth1 &>/dev/null - ip link del veth2 &>/dev/null - ip netns del ns1 &>/dev/null - ip link del veth3 &>/dev/null -} - -cleanup() -{ - pre_cleanup - - destroy -} - -skip() -{ - local skip=1 - ip link add name bond0 type bond mode 1 miimon 100 &>/dev/null - ip link add name veth0 type veth peer name veth0_p - ip link set veth0 master bond0 - - # check if iproute support prio option - ip link set dev veth0 type bond_slave prio 10 - [[ $? -ne 0 ]] && skip=0 - - # check if bonding support prio option - ip -d link show veth0 | grep -q "prio 10" - [[ $? -ne 0 ]] && skip=0 - - ip link del bond0 &>/dev/null - ip link del veth0 - - return $skip -} - -active_slave="" -check_active_slave() -{ - local target_active_slave=$1 - active_slave="$(cat /sys/class/net/bond0/bonding/active_slave)" - test "$active_slave" = "$target_active_slave" - check_err $? "Current active slave is $active_slave but not $target_active_slave" -} - - -# Test bonding prio option with mode=$mode monitor=$monitor -# and primary_reselect=$primary_reselect -prio_test() -{ - RET=0 - - local monitor=$1 - local mode=$2 - local primary_reselect=$3 - - local bond_ip4="192.169.1.2" - local peer_ip4="192.169.1.1" - local bond_ip6="2009:0a:0b::02" - local peer_ip6="2009:0a:0b::01" - - - # create veths - ip link add name veth0 type veth peer name veth0_p - ip link add name veth1 type veth peer name veth1_p - ip link add name veth2 type veth peer name veth2_p - - # create bond - if [[ "$monitor" == "miimon" ]];then - ip link add name bond0 type bond mode $mode miimon 100 primary veth1 primary_reselect $primary_reselect - elif [[ "$monitor" == "arp_ip_target" ]];then - ip link add name bond0 type bond mode $mode arp_interval 1000 arp_ip_target $peer_ip4 primary veth1 primary_reselect $primary_reselect - elif [[ "$monitor" == "ns_ip6_target" ]];then - ip link add name bond0 type bond mode $mode arp_interval 1000 ns_ip6_target $peer_ip6 primary veth1 primary_reselect $primary_reselect - fi - ip link set bond0 up - ip link set veth0 master bond0 - ip link set veth1 master bond0 - ip link set veth2 master bond0 - # check bonding member prio value - ip link set dev veth0 type bond_slave prio 0 - ip link set dev veth1 type bond_slave prio 10 - ip link set dev veth2 type bond_slave prio 11 - ip -d link show veth0 | grep -q 'prio 0' - check_err $? "veth0 prio is not 0" - ip -d link show veth1 | grep -q 'prio 10' - check_err $? "veth0 prio is not 10" - ip -d link show veth2 | grep -q 'prio 11' - check_err $? "veth0 prio is not 11" - - ip link set veth0 up - ip link set veth1 up - ip link set veth2 up - ip link set veth0_p up - ip link set veth1_p up - ip link set veth2_p up - - # prepare ping target - ip link add name br0 type bridge - ip link set br0 up - ip link set veth0_p master br0 - ip link set veth1_p master br0 - ip link set veth2_p master br0 - ip link add name veth3 type veth peer name veth3_p - ip netns add ns1 - ip link set veth3_p master br0 up - ip link set veth3 netns ns1 up - ip netns exec ns1 ip addr add $peer_ip4/24 dev veth3 - ip netns exec ns1 ip addr add $peer_ip6/64 dev veth3 - ip addr add $bond_ip4/24 dev bond0 - ip addr add $bond_ip6/64 dev bond0 - sleep 5 - - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 1." - ping6 $peer_ip6 -c5 -I bond0 &>/dev/null - check_err $? "ping6 failed 1." - - # active salve should be the primary slave - check_active_slave veth1 - - # active slave should be the higher prio slave - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 2." - check_active_slave veth2 - - # when only 1 slave is up - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 3." - check_active_slave veth0 - - # when a higher prio slave change to up - ip link set veth2 up - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 4." - case $primary_reselect in - "0") - check_active_slave "veth2" - ;; - "1") - check_active_slave "veth0" - ;; - "2") - check_active_slave "veth0" - ;; - esac - local pre_active_slave=$active_slave - - # when the primary slave change to up - ip link set veth1 up - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 5." - case $primary_reselect in - "0") - check_active_slave "veth1" - ;; - "1") - check_active_slave "$pre_active_slave" - ;; - "2") - check_active_slave "$pre_active_slave" - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 6." - check_active_slave "veth1" - ;; - esac - - # Test changing bond salve prio - if [[ "$primary_reselect" == "0" ]];then - ip link set dev veth0 type bond_slave prio 1000000 - ip link set dev veth1 type bond_slave prio 0 - ip link set dev veth2 type bond_slave prio -50 - ip -d link show veth0 | grep -q 'prio 1000000' - check_err $? "veth0 prio is not 1000000" - ip -d link show veth1 | grep -q 'prio 0' - check_err $? "veth1 prio is not 0" - ip -d link show veth2 | grep -q 'prio -50' - check_err $? "veth3 prio is not -50" - check_active_slave "veth1" - - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 7." - check_active_slave "veth0" - fi - - cleanup - - log_test "prio_test" "Test bonding option 'prio' with mode=$mode monitor=$monitor and primary_reselect=$primary_reselect" -} - -prio_miimon_test() -{ - local mode - local primary_reselect - - for mode in 1 5 6; do - for primary_reselect in 0 1 2; do - prio_test "miimon" $mode $primary_reselect - done - done -} - -prio_arp_ip_target_test() -{ - local primary_reselect - - for primary_reselect in 0 1 2; do - prio_test "arp_ip_target" 1 $primary_reselect - done -} - -if skip;then - log_test_skip "option_prio.sh" "Current iproute doesn't support 'prio'." - exit 0 -fi - -trap cleanup EXIT - -tests_run - -exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 33a0dbd26bd3..829be379545a 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -43,11 +43,13 @@ #ifndef __KSELFTEST_H #define __KSELFTEST_H +#ifndef NOLIBC #include <errno.h> #include <stdlib.h> #include <unistd.h> #include <stdarg.h> #include <stdio.h> +#endif #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index cc9fd55ab869..2529226ce87c 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -48,3 +48,4 @@ CONFIG_BAREUDP=m CONFIG_IPV6_IOAM6_LWTUNNEL=y CONFIG_CRYPTO_SM4_GENERIC=y CONFIG_AMT=m +CONFIG_IP_SCTP=m diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 48e52f995a98..b1eb7bce599d 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -913,6 +913,7 @@ test_listener() $client4_port > /dev/null 2>&1 & local listener_pid=$! + sleep 0.5 verify_listener_events $client_evts $LISTENER_CREATED $AF_INET 10.0.2.2 $client4_port # ADD_ADDR from client to server machine reusing the subflow port @@ -928,6 +929,7 @@ test_listener() # Delete the listener from the client ns, if one was created kill_wait $listener_pid + sleep 0.5 verify_listener_events $client_evts $LISTENER_CLOSED $AF_INET 10.0.2.2 $client4_port } diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 3243c90d449e..5d467d1993cb 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -62,7 +62,7 @@ class OvsDatapath(GenericNetlinkSocket): nla_map = ( ("OVS_DP_ATTR_UNSPEC", "none"), ("OVS_DP_ATTR_NAME", "asciiz"), - ("OVS_DP_ATTR_UPCALL_PID", "uint32"), + ("OVS_DP_ATTR_UPCALL_PID", "array(uint32)"), ("OVS_DP_ATTR_STATS", "dpstats"), ("OVS_DP_ATTR_MEGAFLOW_STATS", "megaflowstats"), ("OVS_DP_ATTR_USER_FEATURES", "uint32"), diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 8fe61d3e3cce..bbce57420465 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for nolibc tests include ../../../scripts/Makefile.include +# We need this for the "cc-option" macro. +include ../../../build/Build.include # we're in ".../tools/testing/selftests/nolibc" ifeq ($(srctree),) @@ -13,52 +15,56 @@ ARCH = $(SUBARCH) endif # kernel image names by architecture -IMAGE_i386 = arch/x86/boot/bzImage -IMAGE_x86_64 = arch/x86/boot/bzImage -IMAGE_x86 = arch/x86/boot/bzImage -IMAGE_arm64 = arch/arm64/boot/Image -IMAGE_arm = arch/arm/boot/zImage -IMAGE_mips = vmlinuz -IMAGE_riscv = arch/riscv/boot/Image -IMAGE_s390 = arch/s390/boot/bzImage -IMAGE = $(IMAGE_$(ARCH)) -IMAGE_NAME = $(notdir $(IMAGE)) +IMAGE_i386 = arch/x86/boot/bzImage +IMAGE_x86_64 = arch/x86/boot/bzImage +IMAGE_x86 = arch/x86/boot/bzImage +IMAGE_arm64 = arch/arm64/boot/Image +IMAGE_arm = arch/arm/boot/zImage +IMAGE_mips = vmlinuz +IMAGE_riscv = arch/riscv/boot/Image +IMAGE_s390 = arch/s390/boot/bzImage +IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi +IMAGE = $(IMAGE_$(ARCH)) +IMAGE_NAME = $(notdir $(IMAGE)) # default kernel configurations that appear to be usable -DEFCONFIG_i386 = defconfig -DEFCONFIG_x86_64 = defconfig -DEFCONFIG_x86 = defconfig -DEFCONFIG_arm64 = defconfig -DEFCONFIG_arm = multi_v7_defconfig -DEFCONFIG_mips = malta_defconfig -DEFCONFIG_riscv = defconfig -DEFCONFIG_s390 = defconfig -DEFCONFIG = $(DEFCONFIG_$(ARCH)) +DEFCONFIG_i386 = defconfig +DEFCONFIG_x86_64 = defconfig +DEFCONFIG_x86 = defconfig +DEFCONFIG_arm64 = defconfig +DEFCONFIG_arm = multi_v7_defconfig +DEFCONFIG_mips = malta_defconfig +DEFCONFIG_riscv = defconfig +DEFCONFIG_s390 = defconfig +DEFCONFIG_loongarch = defconfig +DEFCONFIG = $(DEFCONFIG_$(ARCH)) # optional tests to run (default = all) TEST = # QEMU_ARCH: arch names used by qemu -QEMU_ARCH_i386 = i386 -QEMU_ARCH_x86_64 = x86_64 -QEMU_ARCH_x86 = x86_64 -QEMU_ARCH_arm64 = aarch64 -QEMU_ARCH_arm = arm -QEMU_ARCH_mips = mipsel # works with malta_defconfig -QEMU_ARCH_riscv = riscv64 -QEMU_ARCH_s390 = s390x -QEMU_ARCH = $(QEMU_ARCH_$(ARCH)) +QEMU_ARCH_i386 = i386 +QEMU_ARCH_x86_64 = x86_64 +QEMU_ARCH_x86 = x86_64 +QEMU_ARCH_arm64 = aarch64 +QEMU_ARCH_arm = arm +QEMU_ARCH_mips = mipsel # works with malta_defconfig +QEMU_ARCH_riscv = riscv64 +QEMU_ARCH_s390 = s390x +QEMU_ARCH_loongarch = loongarch64 +QEMU_ARCH = $(QEMU_ARCH_$(ARCH)) # QEMU_ARGS : some arch-specific args to pass to qemu -QEMU_ARGS_i386 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_x86_64 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_mips = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_s390 = -M s390-ccw-virtio -m 1G -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS = $(QEMU_ARGS_$(ARCH)) +QEMU_ARGS_i386 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_x86_64 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_mips = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_s390 = -M s390-ccw-virtio -m 1G -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS = $(QEMU_ARGS_$(ARCH)) # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. @@ -70,8 +76,16 @@ else Q=@ endif +CFLAGS_STACKPROTECTOR = -DNOLIBC_STACKPROTECTOR \ + $(call cc-option,-mstack-protector-guard=global) \ + $(call cc-option,-fstack-protector-all) +CFLAGS_STKP_i386 = $(CFLAGS_STACKPROTECTOR) +CFLAGS_STKP_x86_64 = $(CFLAGS_STACKPROTECTOR) +CFLAGS_STKP_x86 = $(CFLAGS_STACKPROTECTOR) CFLAGS_s390 = -m64 -CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables $(CFLAGS_$(ARCH)) +CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables \ + $(call cc-option,-fno-stack-protector) \ + $(CFLAGS_STKP_$(ARCH)) $(CFLAGS_$(ARCH)) LDFLAGS := -s help: diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index c4a0c915139c..21bacc928bf7 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -130,111 +130,111 @@ static int pad_spc(int llen, int cnt, const char *fmt, ...) */ #define EXPECT_ZR(cond, expr) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_zr(expr, llen); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_zr(expr, llen); } while (0) static int expect_zr(int expr, int llen) { int ret = !(expr == 0); llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_NZ(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_nz(expr, llen; } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_nz(expr, llen; } while (0) static int expect_nz(int expr, int llen) { int ret = !(expr != 0); llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_EQ(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_eq(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_eq(expr, llen, val); } while (0) -static int expect_eq(int expr, int llen, int val) +static int expect_eq(uint64_t expr, int llen, uint64_t val) { int ret = !(expr == val); - llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + llen += printf(" = %lld ", expr); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_NE(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ne(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_ne(expr, llen, val); } while (0) static int expect_ne(int expr, int llen, int val) { int ret = !(expr != val); llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_GE(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ge(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_ge(expr, llen, val); } while (0) static int expect_ge(int expr, int llen, int val) { int ret = !(expr >= val); llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_GT(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_gt(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_gt(expr, llen, val); } while (0) static int expect_gt(int expr, int llen, int val) { int ret = !(expr > val); llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_LE(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_le(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_le(expr, llen, val); } while (0) static int expect_le(int expr, int llen, int val) { int ret = !(expr <= val); llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_LT(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_lt(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_lt(expr, llen, val); } while (0) static int expect_lt(int expr, int llen, int val) { int ret = !(expr < val); llen += printf(" = %d ", expr); - pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + pad_spc(llen, 64, ret ? "[FAIL]\n" : " [OK]\n"); return ret; } #define EXPECT_SYSZR(cond, expr) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syszr(expr, llen); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_syszr(expr, llen); } while (0) static int expect_syszr(int expr, int llen) { @@ -243,17 +243,17 @@ static int expect_syszr(int expr, int llen) if (expr) { ret = 1; llen += printf(" = %d %s ", expr, errorname(errno)); - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { llen += printf(" = %d ", expr); - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_SYSEQ(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syseq(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_syseq(expr, llen, val); } while (0) static int expect_syseq(int expr, int llen, int val) { @@ -262,17 +262,17 @@ static int expect_syseq(int expr, int llen, int val) if (expr != val) { ret = 1; llen += printf(" = %d %s ", expr, errorname(errno)); - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { llen += printf(" = %d ", expr); - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_SYSNE(cond, expr, val) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_sysne(expr, llen, val); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_sysne(expr, llen, val); } while (0) static int expect_sysne(int expr, int llen, int val) { @@ -281,17 +281,17 @@ static int expect_sysne(int expr, int llen, int val) if (expr == val) { ret = 1; llen += printf(" = %d %s ", expr, errorname(errno)); - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { llen += printf(" = %d ", expr); - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_SYSER(cond, expr, expret, experr) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syserr(expr, expret, experr, llen); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_syserr(expr, expret, experr, llen); } while (0) static int expect_syserr(int expr, int expret, int experr, int llen) { @@ -302,16 +302,16 @@ static int expect_syserr(int expr, int expret, int experr, int llen) if (expr != expret || _errno != experr) { ret = 1; llen += printf(" != (%d %s) ", expret, errorname(experr)); - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_PTRZR(cond, expr) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrzr(expr, llen); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_ptrzr(expr, llen); } while (0) static int expect_ptrzr(const void *expr, int llen) { @@ -320,16 +320,16 @@ static int expect_ptrzr(const void *expr, int llen) llen += printf(" = <%p> ", expr); if (expr) { ret = 1; - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_PTRNZ(cond, expr) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrnz(expr, llen); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_ptrnz(expr, llen); } while (0) static int expect_ptrnz(const void *expr, int llen) { @@ -338,16 +338,16 @@ static int expect_ptrnz(const void *expr, int llen) llen += printf(" = <%p> ", expr); if (!expr) { ret = 1; - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_STRZR(cond, expr) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strzr(expr, llen); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_strzr(expr, llen); } while (0) static int expect_strzr(const char *expr, int llen) { @@ -356,16 +356,16 @@ static int expect_strzr(const char *expr, int llen) llen += printf(" = <%s> ", expr); if (expr) { ret = 1; - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_STRNZ(cond, expr) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strnz(expr, llen); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_strnz(expr, llen); } while (0) static int expect_strnz(const char *expr, int llen) { @@ -374,16 +374,16 @@ static int expect_strnz(const char *expr, int llen) llen += printf(" = <%s> ", expr); if (!expr) { ret = 1; - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_STREQ(cond, expr, cmp) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_streq(expr, llen, cmp); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_streq(expr, llen, cmp); } while (0) static int expect_streq(const char *expr, int llen, const char *cmp) { @@ -392,16 +392,16 @@ static int expect_streq(const char *expr, int llen, const char *cmp) llen += printf(" = <%s> ", expr); if (strcmp(expr, cmp) != 0) { ret = 1; - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } #define EXPECT_STRNE(cond, expr, cmp) \ - do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strne(expr, llen, cmp); } while (0) + do { if (!cond) pad_spc(llen, 64, "[SKIPPED]\n"); else ret += expect_strne(expr, llen, cmp); } while (0) static int expect_strne(const char *expr, int llen, const char *cmp) { @@ -410,9 +410,9 @@ static int expect_strne(const char *expr, int llen, const char *cmp) llen += printf(" = <%s> ", expr); if (strcmp(expr, cmp) == 0) { ret = 1; - llen += pad_spc(llen, 40, "[FAIL]\n"); + llen += pad_spc(llen, 64, "[FAIL]\n"); } else { - llen += pad_spc(llen, 40, " [OK]\n"); + llen += pad_spc(llen, 64, " [OK]\n"); } return ret; } @@ -477,6 +477,7 @@ static int test_getpagesize(void) int run_syscall(int min, int max) { struct stat stat_buf; + int euid0; int proc; int test; int tmp; @@ -486,6 +487,9 @@ int run_syscall(int min, int max) /* <proc> indicates whether or not /proc is mounted */ proc = stat("/proc", &stat_buf) == 0; + /* this will be used to skip certain tests that can't be run unprivileged */ + euid0 = geteuid() == 0; + for (test = min; test >= 0 && test <= max; test++) { int llen = 0; // line length @@ -511,7 +515,7 @@ int run_syscall(int min, int max) CASE_TEST(chmod_net); EXPECT_SYSZR(proc, chmod("/proc/self/net", 0555)); break; CASE_TEST(chmod_self); EXPECT_SYSER(proc, chmod("/proc/self", 0555), -1, EPERM); break; CASE_TEST(chown_self); EXPECT_SYSER(proc, chown("/proc/self", 0, 0), -1, EPERM); break; - CASE_TEST(chroot_root); EXPECT_SYSZR(1, chroot("/")); break; + CASE_TEST(chroot_root); EXPECT_SYSZR(euid0, chroot("/")); break; CASE_TEST(chroot_blah); EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break; CASE_TEST(chroot_exe); EXPECT_SYSER(proc, chroot("/proc/self/exe"), -1, ENOTDIR); break; CASE_TEST(close_m1); EXPECT_SYSER(1, close(-1), -1, EBADF); break; @@ -536,7 +540,7 @@ int run_syscall(int min, int max) CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; CASE_TEST(link_blah); EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break; - CASE_TEST(link_dir); EXPECT_SYSER(1, link("/", "/blah"), -1, EPERM); break; + CASE_TEST(link_dir); EXPECT_SYSER(euid0, link("/", "/blah"), -1, EPERM); break; CASE_TEST(link_cross); EXPECT_SYSER(proc, link("/proc/self/net", "/blah"), -1, EXDEV); break; CASE_TEST(lseek_m1); EXPECT_SYSER(1, lseek(-1, 0, SEEK_SET), -1, EBADF); break; CASE_TEST(lseek_0); EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break; @@ -602,6 +606,59 @@ int run_stdlib(int min, int max) CASE_TEST(memcmp_e0_20); EXPECT_GT(1, memcmp("aaa\xe0", "aaa\x20", 4), 0); break; CASE_TEST(memcmp_80_e0); EXPECT_LT(1, memcmp("aaa\x80", "aaa\xe0", 4), 0); break; CASE_TEST(memcmp_e0_80); EXPECT_GT(1, memcmp("aaa\xe0", "aaa\x80", 4), 0); break; + CASE_TEST(limit_int8_max); EXPECT_EQ(1, INT8_MAX, (int8_t) 0x7f); break; + CASE_TEST(limit_int8_min); EXPECT_EQ(1, INT8_MIN, (int8_t) 0x80); break; + CASE_TEST(limit_uint8_max); EXPECT_EQ(1, UINT8_MAX, (uint8_t) 0xff); break; + CASE_TEST(limit_int16_max); EXPECT_EQ(1, INT16_MAX, (int16_t) 0x7fff); break; + CASE_TEST(limit_int16_min); EXPECT_EQ(1, INT16_MIN, (int16_t) 0x8000); break; + CASE_TEST(limit_uint16_max); EXPECT_EQ(1, UINT16_MAX, (uint16_t) 0xffff); break; + CASE_TEST(limit_int32_max); EXPECT_EQ(1, INT32_MAX, (int32_t) 0x7fffffff); break; + CASE_TEST(limit_int32_min); EXPECT_EQ(1, INT32_MIN, (int32_t) 0x80000000); break; + CASE_TEST(limit_uint32_max); EXPECT_EQ(1, UINT32_MAX, (uint32_t) 0xffffffff); break; + CASE_TEST(limit_int64_max); EXPECT_EQ(1, INT64_MAX, (int64_t) 0x7fffffffffffffff); break; + CASE_TEST(limit_int64_min); EXPECT_EQ(1, INT64_MIN, (int64_t) 0x8000000000000000); break; + CASE_TEST(limit_uint64_max); EXPECT_EQ(1, UINT64_MAX, (uint64_t) 0xffffffffffffffff); break; + CASE_TEST(limit_int_least8_max); EXPECT_EQ(1, INT_LEAST8_MAX, (int_least8_t) 0x7f); break; + CASE_TEST(limit_int_least8_min); EXPECT_EQ(1, INT_LEAST8_MIN, (int_least8_t) 0x80); break; + CASE_TEST(limit_uint_least8_max); EXPECT_EQ(1, UINT_LEAST8_MAX, (uint_least8_t) 0xff); break; + CASE_TEST(limit_int_least16_max); EXPECT_EQ(1, INT_LEAST16_MAX, (int_least16_t) 0x7fff); break; + CASE_TEST(limit_int_least16_min); EXPECT_EQ(1, INT_LEAST16_MIN, (int_least16_t) 0x8000); break; + CASE_TEST(limit_uint_least16_max); EXPECT_EQ(1, UINT_LEAST16_MAX, (uint_least16_t) 0xffff); break; + CASE_TEST(limit_int_least32_max); EXPECT_EQ(1, INT_LEAST32_MAX, (int_least32_t) 0x7fffffff); break; + CASE_TEST(limit_int_least32_min); EXPECT_EQ(1, INT_LEAST32_MIN, (int_least32_t) 0x80000000); break; + CASE_TEST(limit_uint_least32_max); EXPECT_EQ(1, UINT_LEAST32_MAX, (uint_least32_t) 0xffffffffU); break; + CASE_TEST(limit_int_least64_min); EXPECT_EQ(1, INT_LEAST64_MIN, (int_least64_t) 0x8000000000000000LL); break; + CASE_TEST(limit_int_least64_max); EXPECT_EQ(1, INT_LEAST64_MAX, (int_least64_t) 0x7fffffffffffffffLL); break; + CASE_TEST(limit_uint_least64_max); EXPECT_EQ(1, UINT_LEAST64_MAX, (uint_least64_t) 0xffffffffffffffffULL); break; + CASE_TEST(limit_int_fast8_max); EXPECT_EQ(1, INT_FAST8_MAX, (int_fast8_t) 0x7f); break; + CASE_TEST(limit_int_fast8_min); EXPECT_EQ(1, INT_FAST8_MIN, (int_fast8_t) 0x80); break; + CASE_TEST(limit_uint_fast8_max); EXPECT_EQ(1, UINT_FAST8_MAX, (uint_fast8_t) 0xff); break; + CASE_TEST(limit_int_fast16_min); EXPECT_EQ(1, INT_FAST16_MIN, (int_fast16_t) INTPTR_MIN); break; + CASE_TEST(limit_int_fast16_max); EXPECT_EQ(1, INT_FAST16_MAX, (int_fast16_t) INTPTR_MAX); break; + CASE_TEST(limit_uint_fast16_max); EXPECT_EQ(1, UINT_FAST16_MAX, (uint_fast16_t) UINTPTR_MAX); break; + CASE_TEST(limit_int_fast32_min); EXPECT_EQ(1, INT_FAST32_MIN, (int_fast32_t) INTPTR_MIN); break; + CASE_TEST(limit_int_fast32_max); EXPECT_EQ(1, INT_FAST32_MAX, (int_fast32_t) INTPTR_MAX); break; + CASE_TEST(limit_uint_fast32_max); EXPECT_EQ(1, UINT_FAST32_MAX, (uint_fast32_t) UINTPTR_MAX); break; + CASE_TEST(limit_int_fast64_min); EXPECT_EQ(1, INT_FAST64_MIN, (int_fast64_t) INTPTR_MIN); break; + CASE_TEST(limit_int_fast64_max); EXPECT_EQ(1, INT_FAST64_MAX, (int_fast64_t) INTPTR_MAX); break; + CASE_TEST(limit_uint_fast64_max); EXPECT_EQ(1, UINT_FAST64_MAX, (uint_fast64_t) UINTPTR_MAX); break; +#if __SIZEOF_LONG__ == 8 + CASE_TEST(limit_intptr_min); EXPECT_EQ(1, INTPTR_MIN, (intptr_t) 0x8000000000000000LL); break; + CASE_TEST(limit_intptr_max); EXPECT_EQ(1, INTPTR_MAX, (intptr_t) 0x7fffffffffffffffLL); break; + CASE_TEST(limit_uintptr_max); EXPECT_EQ(1, UINTPTR_MAX, (uintptr_t) 0xffffffffffffffffULL); break; + CASE_TEST(limit_ptrdiff_min); EXPECT_EQ(1, PTRDIFF_MIN, (ptrdiff_t) 0x8000000000000000LL); break; + CASE_TEST(limit_ptrdiff_max); EXPECT_EQ(1, PTRDIFF_MAX, (ptrdiff_t) 0x7fffffffffffffffLL); break; + CASE_TEST(limit_size_max); EXPECT_EQ(1, SIZE_MAX, (size_t) 0xffffffffffffffffULL); break; +#elif __SIZEOF_LONG__ == 4 + CASE_TEST(limit_intptr_min); EXPECT_EQ(1, INTPTR_MIN, (intptr_t) 0x80000000); break; + CASE_TEST(limit_intptr_max); EXPECT_EQ(1, INTPTR_MAX, (intptr_t) 0x7fffffff); break; + CASE_TEST(limit_uintptr_max); EXPECT_EQ(1, UINTPTR_MAX, (uintptr_t) 0xffffffffU); break; + CASE_TEST(limit_ptrdiff_min); EXPECT_EQ(1, PTRDIFF_MIN, (ptrdiff_t) 0x80000000); break; + CASE_TEST(limit_ptrdiff_max); EXPECT_EQ(1, PTRDIFF_MAX, (ptrdiff_t) 0x7fffffff); break; + CASE_TEST(limit_size_max); EXPECT_EQ(1, SIZE_MAX, (size_t) 0xffffffffU); break; +#else +# warning "__SIZEOF_LONG__ is undefined" +#endif /* __SIZEOF_LONG__ */ case __LINE__: return ret; /* must be last */ /* note: do not set any defaults so as to permit holes above */ @@ -610,6 +667,63 @@ int run_stdlib(int min, int max) return ret; } +#if defined(__clang__) +__attribute__((optnone)) +#elif defined(__GNUC__) +__attribute__((optimize("O0"))) +#endif +static int smash_stack(void) +{ + char buf[100]; + + for (size_t i = 0; i < 200; i++) + buf[i] = 'P'; + + return 1; +} + +static int run_protection(int min, int max) +{ + pid_t pid; + int llen = 0, status; + + llen += printf("0 -fstackprotector "); + +#if !defined(NOLIBC_STACKPROTECTOR) + llen += printf("not supported"); + pad_spc(llen, 64, "[SKIPPED]\n"); + return 0; +#endif + + pid = -1; + pid = fork(); + + switch (pid) { + case -1: + llen += printf("fork()"); + pad_spc(llen, 64, "[FAIL]\n"); + return 1; + + case 0: + close(STDOUT_FILENO); + close(STDERR_FILENO); + + smash_stack(); + return 1; + + default: + pid = waitpid(pid, &status, 0); + + if (pid == -1 || !WIFSIGNALED(status) || WTERMSIG(status) != SIGABRT) { + llen += printf("waitpid()"); + pad_spc(llen, 64, "[FAIL]\n"); + return 1; + } + pad_spc(llen, 64, " [OK]\n"); + return 0; + } +} + /* prepare what needs to be prepared for pid 1 (stdio, /dev, /proc, etc) */ int prepare(void) { @@ -660,10 +774,11 @@ int prepare(void) } /* This is the definition of known test names, with their functions */ -static struct test test_names[] = { +static const struct test test_names[] = { /* add new tests here */ - { .name = "syscall", .func = run_syscall }, - { .name = "stdlib", .func = run_stdlib }, + { .name = "syscall", .func = run_syscall }, + { .name = "stdlib", .func = run_stdlib }, + { .name = "protection", .func = run_protection }, { 0 } }; diff --git a/tools/testing/selftests/prctl/.gitignore b/tools/testing/selftests/prctl/.gitignore index 91af2b631bc9..7a657b25f686 100644 --- a/tools/testing/selftests/prctl/.gitignore +++ b/tools/testing/selftests/prctl/.gitignore @@ -2,3 +2,4 @@ disable-tsc-ctxt-sw-stress-test disable-tsc-on-off-stress-test disable-tsc-test +set-anon-vma-name-test diff --git a/tools/testing/selftests/prctl/Makefile b/tools/testing/selftests/prctl/Makefile index c7923b205222..c058b81eeb41 100644 --- a/tools/testing/selftests/prctl/Makefile +++ b/tools/testing/selftests/prctl/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) ifeq ($(ARCH),x86) TEST_PROGS := disable-tsc-ctxt-sw-stress-test disable-tsc-on-off-stress-test \ - disable-tsc-test + disable-tsc-test set-anon-vma-name-test all: $(TEST_PROGS) include ../lib.mk diff --git a/tools/testing/selftests/prctl/config b/tools/testing/selftests/prctl/config new file mode 100644 index 000000000000..c6ed03c544e5 --- /dev/null +++ b/tools/testing/selftests/prctl/config @@ -0,0 +1 @@ +CONFIG_ANON_VMA_NAME=y diff --git a/tools/testing/selftests/prctl/set-anon-vma-name-test.c b/tools/testing/selftests/prctl/set-anon-vma-name-test.c new file mode 100644 index 000000000000..26d853c5a0c1 --- /dev/null +++ b/tools/testing/selftests/prctl/set-anon-vma-name-test.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This test covers the anonymous VMA naming functionality through prctl calls + */ + +#include <errno.h> +#include <sys/prctl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <string.h> + +#include "../kselftest_harness.h" + +#define AREA_SIZE 1024 + +#define GOOD_NAME "goodname" +#define BAD_NAME "badname\1" + +#ifndef PR_SET_VMA +#define PR_SET_VMA 0x53564d41 +#define PR_SET_VMA_ANON_NAME 0 +#endif + + +int rename_vma(unsigned long addr, unsigned long size, char *name) +{ + int res; + + res = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, addr, size, name); + if (res < 0) + return -errno; + return res; +} + +int was_renaming_successful(char *target_name, unsigned long ptr) +{ + FILE *maps_file; + + char line_buf[512], name[128], mode[8]; + unsigned long start_addr, end_addr, offset; + unsigned int major_id, minor_id, node_id; + + char target_buf[128]; + int res = 0, sscanf_res; + + // The entry name in maps will be in format [anon:<target_name>] + sprintf(target_buf, "[anon:%s]", target_name); + maps_file = fopen("/proc/self/maps", "r"); + if (!maps_file) { + printf("## /proc/self/maps file opening error\n"); + return 0; + } + + // Parse the maps file to find the entry we renamed + while (fgets(line_buf, sizeof(line_buf), maps_file)) { + sscanf_res = sscanf(line_buf, "%lx-%lx %7s %lx %u:%u %u %s", &start_addr, + &end_addr, mode, &offset, &major_id, + &minor_id, &node_id, name); + if (sscanf_res == EOF) { + res = 0; + printf("## EOF while parsing the maps file\n"); + break; + } + if (!strcmp(name, target_buf) && start_addr == ptr) { + res = 1; + break; + } + } + fclose(maps_file); + return res; +} + +FIXTURE(vma) { + void *ptr_anon, *ptr_not_anon; +}; + +FIXTURE_SETUP(vma) { + self->ptr_anon = mmap(NULL, AREA_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + ASSERT_NE(self->ptr_anon, NULL); + self->ptr_not_anon = mmap(NULL, AREA_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE, 0, 0); + ASSERT_NE(self->ptr_not_anon, NULL); +} + +FIXTURE_TEARDOWN(vma) { + munmap(self->ptr_anon, AREA_SIZE); + munmap(self->ptr_not_anon, AREA_SIZE); +} + +TEST_F(vma, renaming) { + TH_LOG("Try to rename the VMA with correct parameters"); + EXPECT_GE(rename_vma((unsigned long)self->ptr_anon, AREA_SIZE, GOOD_NAME), 0); + EXPECT_TRUE(was_renaming_successful(GOOD_NAME, (unsigned long)self->ptr_anon)); + + TH_LOG("Try to pass invalid name (with non-printable character \\1) to rename the VMA"); + EXPECT_EQ(rename_vma((unsigned long)self->ptr_anon, AREA_SIZE, BAD_NAME), -EINVAL); + + TH_LOG("Try to rename non-anonynous VMA"); + EXPECT_EQ(rename_vma((unsigned long) self->ptr_not_anon, AREA_SIZE, GOOD_NAME), -EINVAL); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/ptrace/peeksiginfo.c b/tools/testing/selftests/ptrace/peeksiginfo.c index 54900657eb44..a6884f66dc01 100644 --- a/tools/testing/selftests/ptrace/peeksiginfo.c +++ b/tools/testing/selftests/ptrace/peeksiginfo.c @@ -151,7 +151,7 @@ out: int main(int argc, char *argv[]) { - siginfo_t siginfo[SIGNR]; + siginfo_t siginfo; int i, exit_code = 1; sigset_t blockmask; pid_t child; @@ -176,13 +176,13 @@ int main(int argc, char *argv[]) /* Send signals in process-wide and per-thread queues */ for (i = 0; i < SIGNR; i++) { - siginfo->si_code = TEST_SICODE_SHARE; - siginfo->si_int = i; - sys_rt_sigqueueinfo(child, SIGRTMIN, siginfo); + siginfo.si_code = TEST_SICODE_SHARE; + siginfo.si_int = i; + sys_rt_sigqueueinfo(child, SIGRTMIN, &siginfo); - siginfo->si_code = TEST_SICODE_PRIV; - siginfo->si_int = i; - sys_rt_tgsigqueueinfo(child, child, SIGRTMIN, siginfo); + siginfo.si_code = TEST_SICODE_PRIV; + siginfo.si_int = i; + sys_rt_tgsigqueueinfo(child, child, SIGRTMIN, &siginfo); } if (sys_ptrace(PTRACE_ATTACH, child, NULL, NULL) == -1) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 8a968fbda02c..88ca4e368489 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -193,7 +193,7 @@ do qemu_cmd_dir="`dirname "$i"`" kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" jitter_dir="`dirname "$kernel_dir"`" - kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" $dur "$bootargs" < $T/qemu-cmd > $i + kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" "$dur" "$bootargs" < $T/qemu-cmd > $i if test -n "$arg_remote" then echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i diff --git a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh new file mode 100755 index 000000000000..2e63ef009d59 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Run SRCU-lockdep tests and report any that fail to meet expectations. +# +# Copyright (C) 2021 Meta Platforms, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +usage () { + echo "Usage: $scriptname optional arguments:" + echo " --datestamp string" + exit 1 +} + +ds=`date +%Y.%m.%d-%H.%M.%S`-srcu_lockdep +scriptname="$0" + +T="`mktemp -d ${TMPDIR-/tmp}/srcu_lockdep.sh.XXXXXX`" +trap 'rm -rf $T' 0 + +RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE +PATH=${RCUTORTURE}/bin:$PATH; export PATH +. functions.sh + +while test $# -gt 0 +do + case "$1" in + --datestamp) + checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._/-]*$' '^--' + ds=$2 + shift + ;; + *) + echo Unknown argument $1 + usage + ;; + esac + shift +done + +err= +nerrs=0 +for d in 0 1 +do + for t in 0 1 2 + do + for c in 1 2 3 + do + err= + val=$((d*1000+t*10+c)) + tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --bootargs "rcutorture.test_srcu_lockdep=$val" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 + ret=$? + mv "$T/kvm.sh.out" "$RCUTORTURE/res/$ds/$val" + if test "$d" -ne 0 && test "$ret" -eq 0 + then + err=1 + echo -n Unexpected success for > "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + fi + if test "$d" -eq 0 && test "$ret" -ne 0 + then + err=1 + echo -n Unexpected failure for > "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + fi + if test -n "$err" + then + grep "rcu_torture_init_srcu_lockdep: test_srcu_lockdep = " "$RCUTORTURE/res/$ds/$val/SRCU-P/console.log" | sed -e 's/^.*rcu_torture_init_srcu_lockdep://' >> "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + cat "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + nerrs=$((nerrs+1)) + fi + done + done +done +if test "$nerrs" -ne 0 +then + exit 1 +fi +exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 130d0de4c3bb..5a2ae2264403 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -497,16 +497,16 @@ fi if test "$do_clocksourcewd" = "yes" then - torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000" + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog" torture_set "clocksourcewd-1" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make - torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1" + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1 tsc=watchdog" torture_set "clocksourcewd-2" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make # In case our work is already done... if test "$do_rcutorture" != "yes" then - torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000" + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog" torture_set "clocksourcewd-3" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --trust-make fi fi diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index 41bae5824339..28e23d05d5a5 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -5,3 +5,5 @@ LOCK04 LOCK05 LOCK06 LOCK07 +LOCK08 +LOCK09 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK08 b/tools/testing/selftests/rcutorture/configs/lock/LOCK08 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK08 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK08.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK08.boot new file mode 100644 index 000000000000..b8b6caebb89e --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK08.boot @@ -0,0 +1 @@ +locktorture.torture_type=mutex_lock locktorture.nested_locks=8 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK09 b/tools/testing/selftests/rcutorture/configs/lock/LOCK09 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK09 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK09.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK09.boot new file mode 100644 index 000000000000..fd5eff148a93 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK09.boot @@ -0,0 +1 @@ +locktorture.torture_type=rtmutex_lock locktorture.nested_locks=8 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index 8ae41d5f81a3..04831ef1f9b5 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -15,3 +15,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y +CONFIG_BOOTPARAM_HOTPLUG_CPU0=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index ae395981b5e5..dc4985064b3a 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -15,3 +15,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y CONFIG_RCU_EQS_DEBUG=y +CONFIG_RCU_LAZY=y diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 42acb1a64ce1..3f5fb66f16df 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -71,9 +71,5 @@ CONFIG_TASKS_RCU These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP. -CONFIG_SRCU - - Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable. - boot parameters ignored: TBD diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 68ff856d36f0..8a4fe8693be6 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -48,7 +48,7 @@ static int perf_event_open_llc_miss(pid_t pid, int cpu_no) return 0; } -static int initialize_llc_perf(void) +static void initialize_llc_perf(void) { memset(&pea_llc_miss, 0, sizeof(struct perf_event_attr)); memset(&rf_cqm, 0, sizeof(struct read_format)); @@ -59,8 +59,6 @@ static int initialize_llc_perf(void) pea_llc_miss.config = PERF_COUNT_HW_CACHE_MISSES; rf_cqm.nr = 1; - - return 0; } static int reset_enable_llc_perf(pid_t pid, int cpu_no) @@ -79,7 +77,7 @@ static int reset_enable_llc_perf(pid_t pid, int cpu_no) /* * get_llc_perf: llc cache miss through perf events - * @cpu_no: CPU number that the benchmark PID is binded to + * @llc_perf_miss: LLC miss counter that is filled on success * * Perf events like HW_CACHE_MISSES could be used to validate number of * cache lines allocated. @@ -234,20 +232,19 @@ int cat_val(struct resctrl_val_param *param) if (ret) return ret; - if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { - ret = initialize_llc_perf(); - if (ret) - return ret; - } + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) + initialize_llc_perf(); /* Test runs until the callback setup() tells the test to stop. */ while (1) { if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = param->setup(1, param); - if (ret) { + if (ret == END_OF_TESTS) { ret = 0; break; } + if (ret < 0) + break; ret = reset_enable_llc_perf(bm_pid, param->cpu_no); if (ret) break; diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 1c5e90c63254..fb1443f888c4 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -40,7 +40,7 @@ static int cat_setup(int num, ...) /* Run NUM_OF_RUNS times */ if (p->num_of_runs >= NUM_OF_RUNS) - return -1; + return END_OF_TESTS; if (p->num_of_runs == 0) { sprintf(schemata, "%lx", p->mask); @@ -103,7 +103,6 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) unsigned long l_mask, l_mask_1; int ret, pipefd[2], sibling_cpu_no; char pipe_message; - pid_t bm_pid; cache_size = 0; @@ -145,7 +144,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) struct resctrl_val_param param = { .resctrl_val = CAT_STR, .cpu_no = cpu_no, - .mum_resctrlfs = 0, + .mum_resctrlfs = false, .setup = cat_setup, }; @@ -167,6 +166,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) return errno; } + fflush(stdout); bm_pid = fork(); /* Set param values for child thread which will be allocated bitmask @@ -180,28 +180,31 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) strcpy(param.filename, RESULT_FILE_NAME1); param.num_of_runs = 0; param.cpu_no = sibling_cpu_no; + } else { + ret = signal_handler_register(); + if (ret) { + kill(bm_pid, SIGKILL); + goto out; + } } remove(param.filename); ret = cat_val(¶m); - if (ret) - return ret; - - ret = check_results(¶m); - if (ret) - return ret; + if (ret == 0) + ret = check_results(¶m); if (bm_pid == 0) { /* Tell parent that child is ready */ close(pipefd[0]); pipe_message = 1; if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < - sizeof(pipe_message)) { - close(pipefd[1]); + sizeof(pipe_message)) + /* + * Just print the error message. + * Let while(1) run and wait for itself to be killed. + */ perror("# failed signaling parent process"); - return errno; - } close(pipefd[1]); while (1) @@ -219,11 +222,13 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) } close(pipefd[0]); kill(bm_pid, SIGKILL); + signal_handler_unregister(); } +out: cat_test_cleanup(); if (bm_pid) umount_resctrlfs(); - return 0; + return ret; } diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index 8968e36db99d..af71b2141271 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -32,7 +32,7 @@ static int cmt_setup(int num, ...) /* Run NUM_OF_RUNS times */ if (p->num_of_runs >= NUM_OF_RUNS) - return -1; + return END_OF_TESTS; p->num_of_runs++; @@ -82,12 +82,11 @@ void cmt_test_cleanup(void) int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) { - int ret, mum_resctrlfs; + int ret; cache_size = 0; - mum_resctrlfs = 1; - ret = remount_resctrlfs(mum_resctrlfs); + ret = remount_resctrlfs(true); if (ret) return ret; @@ -118,7 +117,7 @@ int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, - .mum_resctrlfs = 0, + .mum_resctrlfs = false, .filename = RESULT_FILE_NAME, .mask = ~(long_mask << n) & long_mask, .span = cache_size * n / count_of_bits, @@ -133,13 +132,12 @@ int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) ret = resctrl_val(benchmark_cmd, ¶m); if (ret) - return ret; + goto out; ret = check_results(¶m, n); - if (ret) - return ret; +out: cmt_test_cleanup(); - return 0; + return ret; } diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c index 56ccbeae0638..341cc93ca84c 100644 --- a/tools/testing/selftests/resctrl/fill_buf.c +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -14,7 +14,6 @@ #include <sys/types.h> #include <sys/wait.h> #include <inttypes.h> -#include <malloc.h> #include <string.h> #include "resctrl.h" @@ -33,14 +32,6 @@ static void sb(void) #endif } -static void ctrl_handler(int signo) -{ - free(startptr); - printf("\nEnding\n"); - sb(); - exit(EXIT_SUCCESS); -} - static void cl_flush(void *p) { #if defined(__i386) || defined(__x86_64) @@ -64,10 +55,14 @@ static void mem_flush(void *p, size_t s) static void *malloc_and_init_memory(size_t s) { + void *p = NULL; uint64_t *p64; size_t s64; + int ret; - void *p = memalign(PAGE_SIZE, s); + ret = posix_memalign(&p, PAGE_SIZE, s); + if (ret < 0) + return NULL; p64 = (uint64_t *)p; s64 = s / sizeof(uint64_t); @@ -198,12 +193,6 @@ int run_fill_buf(unsigned long span, int malloc_and_init_memory, unsigned long long cache_size = span; int ret; - /* set up ctrl-c handler */ - if (signal(SIGINT, ctrl_handler) == SIG_ERR) - printf("Failed to catch SIGINT!\n"); - if (signal(SIGHUP, ctrl_handler) == SIG_ERR) - printf("Failed to catch SIGHUP!\n"); - ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op, resctrl_val); if (ret) { diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 1a1bdb6180cf..cde3781a9ab0 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -28,6 +28,7 @@ static int mba_setup(int num, ...) struct resctrl_val_param *p; char allocation_str[64]; va_list param; + int ret; va_start(param, num); p = va_arg(param, struct resctrl_val_param *); @@ -41,20 +42,24 @@ static int mba_setup(int num, ...) return 0; if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX) - return -1; + return END_OF_TESTS; sprintf(allocation_str, "%d", allocation); - write_schemata(p->ctrlgrp, allocation_str, p->cpu_no, p->resctrl_val); + ret = write_schemata(p->ctrlgrp, allocation_str, p->cpu_no, + p->resctrl_val); + if (ret < 0) + return ret; + allocation -= ALLOCATION_STEP; return 0; } -static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) +static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) { int allocation, runs; - bool failed = false; + bool ret = false; ksft_print_msg("Results are displayed in (MB)\n"); /* Memory bandwidth from 100% down to 10% */ @@ -90,13 +95,15 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); if (avg_diff_per > MAX_DIFF_PERCENT) - failed = true; + ret = true; } ksft_print_msg("%s Check schemata change using MBA\n", - failed ? "Fail:" : "Pass:"); - if (failed) + ret ? "Fail:" : "Pass:"); + if (ret) ksft_print_msg("At least one test failed\n"); + + return ret; } static int check_results(void) @@ -132,9 +139,7 @@ static int check_results(void) fclose(fp); - show_mba_info(bw_imc, bw_resc); - - return 0; + return show_mba_info(bw_imc, bw_resc); } void mba_test_cleanup(void) @@ -149,7 +154,7 @@ int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, - .mum_resctrlfs = 1, + .mum_resctrlfs = true, .filename = RESULT_FILE_NAME, .bw_report = bw_report, .setup = mba_setup @@ -160,13 +165,12 @@ int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) ret = resctrl_val(benchmark_cmd, ¶m); if (ret) - return ret; + goto out; ret = check_results(); - if (ret) - return ret; +out: mba_test_cleanup(); - return 0; + return ret; } diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 8392e5c55ed0..538d35a6485a 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -89,23 +89,24 @@ static int check_results(int span) static int mbm_setup(int num, ...) { struct resctrl_val_param *p; - static int num_of_runs; va_list param; int ret = 0; - /* Run NUM_OF_RUNS times */ - if (num_of_runs++ >= NUM_OF_RUNS) - return -1; - va_start(param, num); p = va_arg(param, struct resctrl_val_param *); va_end(param); + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return END_OF_TESTS; + /* Set up shemata with 100% allocation on the first run. */ - if (num_of_runs == 0) + if (p->num_of_runs == 0) ret = write_schemata(p->ctrlgrp, "100", p->cpu_no, p->resctrl_val); + p->num_of_runs++; + return ret; } @@ -122,7 +123,7 @@ int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) .mongrp = "m1", .span = span, .cpu_no = cpu_no, - .mum_resctrlfs = 1, + .mum_resctrlfs = true, .filename = RESULT_FILE_NAME, .bw_report = bw_report, .setup = mbm_setup @@ -133,13 +134,12 @@ int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) ret = resctrl_val(benchmark_cmd, ¶m); if (ret) - return ret; + goto out; ret = check_results(span); - if (ret) - return ret; +out: mbm_test_cleanup(); - return 0; + return ret; } diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index f0ded31fb3c7..87e39456dee0 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -28,7 +28,7 @@ #define MB (1024 * 1024) #define RESCTRL_PATH "/sys/fs/resctrl" #define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" -#define CBM_MASK_PATH "/sys/fs/resctrl/info" +#define INFO_PATH "/sys/fs/resctrl/info" #define L3_PATH "/sys/fs/resctrl/info/L3" #define MB_PATH "/sys/fs/resctrl/info/MB" #define L3_MON_PATH "/sys/fs/resctrl/info/L3_MON" @@ -37,6 +37,8 @@ #define ARCH_INTEL 1 #define ARCH_AMD 2 +#define END_OF_TESTS 1 + #define PARENT_EXIT(err_msg) \ do { \ perror(err_msg); \ @@ -62,7 +64,7 @@ struct resctrl_val_param { char mongrp[64]; int cpu_no; unsigned long span; - int mum_resctrlfs; + bool mum_resctrlfs; char filename[64]; char *bw_report; unsigned long mask; @@ -107,6 +109,8 @@ void mba_test_cleanup(void); int get_cbm_mask(char *cache_type, char *cbm_mask); int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); +int signal_handler_register(void); +void signal_handler_unregister(void); int cat_val(struct resctrl_val_param *param); void cat_test_cleanup(void); int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index df0d8d8526fc..9b9751206e1c 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -77,7 +77,7 @@ static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, ksft_print_msg("Starting MBM BW change ...\n"); - if (!validate_resctrl_feature_request(MBM_STR)) { + if (!validate_resctrl_feature_request(MBM_STR) || (get_vendor() != ARCH_INTEL)) { ksft_test_result_skip("Hardware does not support MBM or MBM is disabled\n"); return; } @@ -88,7 +88,6 @@ static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, ksft_test_result(!res, "MBM: bw change\n"); if ((get_vendor() == ARCH_INTEL) && res) ksft_print_msg("Intel MBM may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n"); - mbm_test_cleanup(); } static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, @@ -98,7 +97,7 @@ static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, ksft_print_msg("Starting MBA Schemata change ...\n"); - if (!validate_resctrl_feature_request(MBA_STR)) { + if (!validate_resctrl_feature_request(MBA_STR) || (get_vendor() != ARCH_INTEL)) { ksft_test_result_skip("Hardware does not support MBA or MBA is disabled\n"); return; } @@ -107,7 +106,6 @@ static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, sprintf(benchmark_cmd[1], "%d", span); res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); ksft_test_result(!res, "MBA: schemata change\n"); - mba_test_cleanup(); } static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) @@ -126,7 +124,6 @@ static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) ksft_test_result(!res, "CMT: test\n"); if ((get_vendor() == ARCH_INTEL) && res) ksft_print_msg("Intel CMT may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n"); - cmt_test_cleanup(); } static void run_cat_test(int cpu_no, int no_of_bits) @@ -142,7 +139,6 @@ static void run_cat_test(int cpu_no, int no_of_bits) res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); ksft_test_result(!res, "CAT: test\n"); - cat_test_cleanup(); } int main(int argc, char **argv) @@ -258,10 +254,10 @@ int main(int argc, char **argv) ksft_set_plan(tests ? : 4); - if ((get_vendor() == ARCH_INTEL) && mbm_test) + if (mbm_test) run_mbm_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); - if ((get_vendor() == ARCH_INTEL) && mba_test) + if (mba_test) run_mba_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); if (cmt_test) @@ -272,5 +268,5 @@ int main(int argc, char **argv) umount_resctrlfs(); - return ksft_exit_pass(); + ksft_finished(); } diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index b32b96356ec7..ab1eab1e7ff6 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -477,6 +477,45 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) } /* + * Register CTRL-C handler for parent, as it has to kill + * child process before exiting. + */ +int signal_handler_register(void) +{ + struct sigaction sigact; + int ret = 0; + + sigact.sa_sigaction = ctrlc_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = SA_SIGINFO; + if (sigaction(SIGINT, &sigact, NULL) || + sigaction(SIGTERM, &sigact, NULL) || + sigaction(SIGHUP, &sigact, NULL)) { + perror("# sigaction"); + ret = -1; + } + return ret; +} + +/* + * Reset signal handler to SIG_DFL. + * Non-Value return because the caller should keep + * the error code of other path even if sigaction fails. + */ +void signal_handler_unregister(void) +{ + struct sigaction sigact; + + sigact.sa_handler = SIG_DFL; + sigemptyset(&sigact.sa_mask); + if (sigaction(SIGINT, &sigact, NULL) || + sigaction(SIGTERM, &sigact, NULL) || + sigaction(SIGHUP, &sigact, NULL)) { + perror("# sigaction"); + } +} + +/* * print_results_bw: the memory bandwidth results are stored in a file * @filename: file that stores the results * @bm_pid: child pid that runs benchmark @@ -629,6 +668,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) * Fork to start benchmark, save child's pid so that it can be killed * when needed */ + fflush(stdout); bm_pid = fork(); if (bm_pid == -1) { perror("# Unable to fork"); @@ -670,39 +710,28 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) ksft_print_msg("Benchmark PID: %d\n", bm_pid); - /* - * Register CTRL-C handler for parent, as it has to kill benchmark - * before exiting - */ - sigact.sa_sigaction = ctrlc_handler; - sigemptyset(&sigact.sa_mask); - sigact.sa_flags = SA_SIGINFO; - if (sigaction(SIGINT, &sigact, NULL) || - sigaction(SIGTERM, &sigact, NULL) || - sigaction(SIGHUP, &sigact, NULL)) { - perror("# sigaction"); - ret = errno; + ret = signal_handler_register(); + if (ret) goto out; - } value.sival_ptr = benchmark_cmd; /* Taskset benchmark to specified cpu */ ret = taskset_benchmark(bm_pid, param->cpu_no); if (ret) - goto out; + goto unregister; /* Write benchmark to specified control&monitoring grp in resctrl FS */ ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, resctrl_val); if (ret) - goto out; + goto unregister; if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { ret = initialize_mem_bw_imc(); if (ret) - goto out; + goto unregister; initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); @@ -717,7 +746,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) sizeof(pipe_message)) { perror("# failed reading message from child process"); close(pipefd[0]); - goto out; + goto unregister; } } close(pipefd[0]); @@ -726,7 +755,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) if (sigqueue(bm_pid, SIGUSR1, value) == -1) { perror("# sigqueue SIGUSR1 to child"); ret = errno; - goto out; + goto unregister; } /* Give benchmark enough time to fully run */ @@ -734,32 +763,29 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. */ while (1) { + ret = param->setup(1, param); + if (ret == END_OF_TESTS) { + ret = 0; + break; + } + if (ret < 0) + break; + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { - ret = param->setup(1, param); - if (ret) { - ret = 0; - break; - } - ret = measure_vals(param, &bw_resc_start); if (ret) break; } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { - ret = param->setup(1, param); - if (ret) { - ret = 0; - break; - } sleep(1); ret = measure_cache_vals(param, bm_pid); if (ret) break; - } else { - break; } } +unregister: + signal_handler_unregister(); out: kill(bm_pid, SIGKILL); umount_resctrlfs(); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 6f543e470ad4..fb00245dee92 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -210,7 +210,7 @@ int get_cbm_mask(char *cache_type, char *cbm_mask) if (!cbm_mask) return -1; - sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); + sprintf(cbm_mask_path, "%s/%s/cbm_mask", INFO_PATH, cache_type); fp = fopen(cbm_mask_path, "r"); if (!fp) { @@ -498,6 +498,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) FILE *fp; if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) && + strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) && strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) && strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) return -ENOENT; @@ -523,7 +524,8 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) || !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); - if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); fp = fopen(controlgroup, "w"); @@ -676,6 +678,7 @@ int filter_dmesg(void) perror("pipe"); return ret; } + fflush(stdout); pid = fork(); if (pid == 0) { close(pipefds[0]); diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c index 25e0d95d3713..3e1619b6bf2d 100644 --- a/tools/testing/selftests/sched/cs_prctl_test.c +++ b/tools/testing/selftests/sched/cs_prctl_test.c @@ -334,6 +334,12 @@ int main(int argc, char *argv[]) validate(get_cs_cookie(pid) != 0); validate(get_cs_cookie(pid) == get_cs_cookie(procs[pidx].thr_tids[0])); + validate(_prctl(PR_SCHED_CORE, PR_SCHED_CORE_MAX, 0, PIDTYPE_PGID, 0) < 0 + && errno == EINVAL); + + validate(_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, 0, PIDTYPE_PGID, 1) < 0 + && errno == EINVAL); + if (errors) { printf("TESTS FAILED. errors: %d\n", errors); res = 10; diff --git a/tools/virtio/virtio-trace/README b/tools/virtio/virtio-trace/README index b64845b823ab..4fb9368bf751 100644 --- a/tools/virtio/virtio-trace/README +++ b/tools/virtio/virtio-trace/README @@ -61,7 +61,7 @@ and id=channel0,name=agent-ctl-path\ ##data path## -chardev pipe,id=charchannel1,path=/tmp/virtio-trace/trace-path-cpu0\ - -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel0,\ + -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,\ id=channel1,name=trace-path-cpu0\ ... diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index ee01e40e8bc6..61230532fef1 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -353,6 +353,12 @@ static int cpio_mkfile(const char *name, const char *location, buf.st_mtime = 0xffffffff; } + if (buf.st_mtime < 0) { + fprintf(stderr, "%s: Timestamp negative, clipping.\n", + location); + buf.st_mtime = 0; + } + if (buf.st_size > 0xffffffff) { fprintf(stderr, "%s: Size exceeds maximum cpio file size\n", location); @@ -602,10 +608,10 @@ int main (int argc, char *argv[]) /* * Timestamps after 2106-02-07 06:28:15 UTC have an ascii hex time_t * representation that exceeds 8 chars and breaks the cpio header - * specification. + * specification. Negative timestamps similarly exceed 8 chars. */ - if (default_mtime > 0xffffffff) { - fprintf(stderr, "ERROR: Timestamp too large for cpio format\n"); + if (default_mtime > 0xffffffff || default_mtime < 0) { + fprintf(stderr, "ERROR: Timestamp out of range for cpio format\n"); exit(1); } |