diff options
389 files changed, 4318 insertions, 2652 deletions
@@ -376,6 +376,7 @@ Juha Yrjola <juha.yrjola@solidboot.com> Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com> Iskren Chernev <me@iskren.info> <iskren.chernev@gmail.com> Kalle Valo <kvalo@kernel.org> <kvalo@codeaurora.org> +Kalle Valo <kvalo@kernel.org> <quic_kvalo@quicinc.com> Kalyan Thota <quic_kalyant@quicinc.com> <kalyan_t@codeaurora.org> Karthikeyan Periyasamy <quic_periyasa@quicinc.com> <periyasa@codeaurora.org> Kathiravan T <quic_kathirav@quicinc.com> <kathirav@codeaurora.org> @@ -2515,11 +2515,9 @@ D: SLS distribution D: Initial implementation of VC's, pty's and select() N: Pavel Machek -E: pavel@ucw.cz +E: pavel@kernel.org P: 4096R/92DFCE96 4FA7 9EEF FCD4 C44F C585 B8C7 C060 2241 92DF CE96 -D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd, -D: sun4/330 port, capabilities for elf, speedup for rm on ext2, USB, -D: work on suspend-to-ram/disk, killing duplicates from ioctl32, +D: NBD, Sun4/330 port, USB, work on suspend-to-ram/disk, D: Altera SoCFPGA and Nokia N900 support. S: Czech Republic diff --git a/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml b/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml index b2adc7174177..dca16e202da9 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml @@ -14,9 +14,8 @@ allOf: description: | The Microchip LAN966x outband interrupt controller (OIC) maps the internal - interrupt sources of the LAN966x device to an external interrupt. - When the LAN966x device is used as a PCI device, the external interrupt is - routed to the PCI interrupt. + interrupt sources of the LAN966x device to a PCI interrupt when the LAN966x + device is used as a PCI device. properties: compatible: diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml index 070c4c9b8643..aace072e2d52 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml @@ -7,7 +7,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies ath10k wireless devices maintainers: - - Kalle Valo <kvalo@kernel.org> - Jeff Johnson <jjohnson@kernel.org> description: diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml index a71fdf05bc1e..a4425cf196ab 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml @@ -8,7 +8,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies ath11k wireless devices (PCIe) maintainers: - - Kalle Valo <kvalo@kernel.org> - Jeff Johnson <jjohnson@kernel.org> description: | diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml index ff5763dc66a8..a69ffb7b3cb8 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml @@ -8,7 +8,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies ath11k wireless devices maintainers: - - Kalle Valo <kvalo@kernel.org> - Jeff Johnson <jjohnson@kernel.org> description: | diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml index cbfb559f6b69..318f305405e3 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml @@ -9,7 +9,6 @@ title: Qualcomm Technologies ath12k wireless devices (PCIe) with WSI interface maintainers: - Jeff Johnson <jjohnson@kernel.org> - - Kalle Valo <kvalo@kernel.org> description: | Qualcomm Technologies IEEE 802.11be PCIe devices with WSI interface. diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml index 1b5884015b15..9e557cb838c7 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml @@ -9,7 +9,6 @@ title: Qualcomm Technologies ath12k wireless devices (PCIe) maintainers: - Jeff Johnson <quic_jjohnson@quicinc.com> - - Kalle Valo <kvalo@kernel.org> description: Qualcomm Technologies IEEE 802.11be PCIe devices. diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst new file mode 100644 index 000000000000..026b12ae0d6a --- /dev/null +++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst @@ -0,0 +1,98 @@ +Submitting patches to bcachefs: +=============================== + +Patches must be tested before being submitted, either with the xfstests suite +[0], or the full bcachefs test suite in ktest [1], depending on what's being +touched. Note that ktest wraps xfstests and will be an easier method to running +it for most users; it includes single-command wrappers for all the mainstream +in-kernel local filesystems. + +Patches will undergo more testing after being merged (including +lockdep/kasan/preempt/etc. variants), these are not generally required to be +run by the submitter - but do put some thought into what you're changing and +which tests might be relevant, e.g. are you dealing with tricky memory layout +work? kasan, are you doing locking work? then lockdep; and ktest includes +single-command variants for the debug build types you'll most likely need. + +The exception to this rule is incomplete WIP/RFC patches: if you're working on +something nontrivial, it's encouraged to send out a WIP patch to let people +know what you're doing and make sure you're on the right track. Just make sure +it includes a brief note as to what's done and what's incomplete, to avoid +confusion. + +Rigorous checkpatch.pl adherence is not required (many of its warnings are +considered out of date), but try not to deviate too much without reason. + +Focus on writing code that reads well and is organized well; code should be +aesthetically pleasing. + +CI: +=== + +Instead of running your tests locally, when running the full test suite it's +prefereable to let a server farm do it in parallel, and then have the results +in a nice test dashboard (which can tell you which failures are new, and +presents results in a git log view, avoiding the need for most bisecting). + +That exists [2], and community members may request an account. If you work for +a big tech company, you'll need to help out with server costs to get access - +but the CI is not restricted to running bcachefs tests: it runs any ktest test +(which generally makes it easy to wrap other tests that can run in qemu). + +Other things to think about: +============================ + +- How will we debug this code? Is there sufficient introspection to diagnose + when something starts acting wonky on a user machine? + + We don't necessarily need every single field of every data structure visible + with introspection, but having the important fields of all the core data + types wired up makes debugging drastically easier - a bit of thoughtful + foresight greatly reduces the need to have people build custom kernels with + debug patches. + + More broadly, think about all the debug tooling that might be needed. + +- Does it make the codebase more or less of a mess? Can we also try to do some + organizing, too? + +- Do new tests need to be written? New assertions? How do we know and verify + that the code is correct, and what happens if something goes wrong? + + We don't yet have automated code coverage analysis or easy fault injection - + but for now, pretend we did and ask what they might tell us. + + Assertions are hugely important, given that we don't yet have a systems + language that can do ergonomic embedded correctness proofs. Hitting an assert + in testing is much better than wandering off into undefined behaviour la-la + land - use them. Use them judiciously, and not as a replacement for proper + error handling, but use them. + +- Does it need to be performance tested? Should we add new peformance counters? + + bcachefs has a set of persistent runtime counters which can be viewed with + the 'bcachefs fs top' command; this should give users a basic idea of what + their filesystem is currently doing. If you're doing a new feature or looking + at old code, think if anything should be added. + +- If it's a new on disk format feature - have upgrades and downgrades been + tested? (Automated tests exists but aren't in the CI, due to the hassle of + disk image management; coordinate to have them run.) + +Mailing list, IRC: +================== + +Patches should hit the list [3], but much discussion and code review happens on +IRC as well [4]; many people appreciate the more conversational approach and +quicker feedback. + +Additionally, we have a lively user community doing excellent QA work, which +exists primarily on IRC. Please make use of that resource; user feedback is +important for any nontrivial feature, and documenting it in commit messages +would be a good idea. + +[0]: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git +[1]: https://evilpiepirate.org/git/ktest.git/ +[2]: https://evilpiepirate.org/~testdashboard/ci/ +[3]: linux-bcachefs@vger.kernel.org +[4]: irc.oftc.net#bcache, #bcachefs-dev diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst index 95fc4b90739e..7db4d7ceab58 100644 --- a/Documentation/filesystems/bcachefs/index.rst +++ b/Documentation/filesystems/bcachefs/index.rst @@ -9,4 +9,5 @@ bcachefs Documentation :numbered: CodingStyle + SubmittingPatches errorcodes diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 259cb211a338..655d8d10fe24 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1524,7 +1524,8 @@ attribute-sets: nested-attributes: bitset - name: hwtstamp-flags - type: u32 + type: nest + nested-attributes: bitset operations: enum-model: directional diff --git a/Documentation/networking/iso15765-2.rst b/Documentation/networking/iso15765-2.rst index 0e9d96074178..37ebb2c417cb 100644 --- a/Documentation/networking/iso15765-2.rst +++ b/Documentation/networking/iso15765-2.rst @@ -369,8 +369,8 @@ to their default. addr.can_family = AF_CAN; addr.can_ifindex = if_nametoindex("can0"); - addr.tp.tx_id = 0x18DA42F1 | CAN_EFF_FLAG; - addr.tp.rx_id = 0x18DAF142 | CAN_EFF_FLAG; + addr.can_addr.tp.tx_id = 0x18DA42F1 | CAN_EFF_FLAG; + addr.can_addr.tp.rx_id = 0x18DAF142 | CAN_EFF_FLAG; ret = bind(s, (struct sockaddr *)&addr, sizeof(addr)); if (ret < 0) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0d1c3a820ce6..2b52eb77e29c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1419,7 +1419,7 @@ fetch) is injected in the guest. S390: ^^^^^ -Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set. +Returns -EINVAL or -EEXIST if the VM has the KVM_VM_S390_UCONTROL flag set. Returns -EINVAL if called on a protected VM. 4.36 KVM_SET_TSS_ADDR diff --git a/MAINTAINERS b/MAINTAINERS index 896a307fa065..10893c91b1c1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2209,7 +2209,6 @@ F: sound/soc/codecs/cs42l84.* F: sound/soc/codecs/ssm3515.c ARM/APPLE MACHINE SUPPORT -M: Hector Martin <marcan@marcan.st> M: Sven Peter <sven@svenpeter.dev> R: Alyssa Rosenzweig <alyssa@rosenzweig.io> L: asahi@lists.linux.dev @@ -3655,7 +3654,6 @@ F: Documentation/devicetree/bindings/phy/phy-ath79-usb.txt F: drivers/phy/qualcomm/phy-ath79-usb.c ATHEROS ATH GENERIC UTILITIES -M: Kalle Valo <kvalo@kernel.org> M: Jeff Johnson <jjohnson@kernel.org> L: linux-wireless@vger.kernel.org S: Supported @@ -3955,6 +3953,7 @@ M: Kent Overstreet <kent.overstreet@linux.dev> L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache +P: Documentation/filesystems/bcachefs/SubmittingPatches.rst T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ F: Documentation/filesystems/bcachefs/ @@ -9418,7 +9417,7 @@ F: fs/freevxfs/ FREEZER M: "Rafael J. Wysocki" <rafael@kernel.org> -M: Pavel Machek <pavel@ucw.cz> +M: Pavel Machek <pavel@kernel.org> L: linux-pm@vger.kernel.org S: Supported F: Documentation/power/freezing-of-tasks.rst @@ -9878,7 +9877,7 @@ S: Maintained F: drivers/staging/gpib/ GPIO ACPI SUPPORT -M: Mika Westerberg <mika.westerberg@linux.intel.com> +M: Mika Westerberg <westeri@kernel.org> M: Andy Shevchenko <andriy.shevchenko@linux.intel.com> L: linux-gpio@vger.kernel.org L: linux-acpi@vger.kernel.org @@ -10253,7 +10252,7 @@ F: drivers/video/fbdev/hgafb.c HIBERNATION (aka Software Suspend, aka swsusp) M: "Rafael J. Wysocki" <rafael@kernel.org> -M: Pavel Machek <pavel@ucw.cz> +M: Pavel Machek <pavel@kernel.org> L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org @@ -13124,8 +13123,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/har F: scripts/leaking_addresses.pl LED SUBSYSTEM -M: Pavel Machek <pavel@ucw.cz> M: Lee Jones <lee@kernel.org> +M: Pavel Machek <pavel@kernel.org> L: linux-leds@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git @@ -16438,7 +16437,7 @@ X: drivers/net/can/ X: drivers/net/wireless/ NETWORKING DRIVERS (WIRELESS) -M: Kalle Valo <kvalo@kernel.org> +M: Johannes Berg <johannes@sipsolutions.net> L: linux-wireless@vger.kernel.org S: Maintained W: https://wireless.wiki.kernel.org/ @@ -16462,6 +16461,22 @@ F: include/net/dsa.h F: net/dsa/ F: tools/testing/selftests/drivers/net/dsa/ +NETWORKING [ETHTOOL] +M: Andrew Lunn <andrew@lunn.ch> +M: Jakub Kicinski <kuba@kernel.org> +F: Documentation/netlink/specs/ethtool.yaml +F: Documentation/networking/ethtool-netlink.rst +F: include/linux/ethtool* +F: include/uapi/linux/ethtool* +F: net/ethtool/ +F: tools/testing/selftests/drivers/net/*/ethtool* + +NETWORKING [ETHTOOL CABLE TEST] +M: Andrew Lunn <andrew@lunn.ch> +F: net/ethtool/cabletest.c +F: tools/testing/selftests/drivers/net/*/ethtool* +K: cable_test + NETWORKING [GENERAL] M: "David S. Miller" <davem@davemloft.net> M: Eric Dumazet <edumazet@google.com> @@ -16493,6 +16508,7 @@ F: include/linux/netdev* F: include/linux/netlink.h F: include/linux/netpoll.h F: include/linux/rtnetlink.h +F: include/linux/sctp.h F: include/linux/seq_file_net.h F: include/linux/skbuff* F: include/net/ @@ -16509,6 +16525,7 @@ F: include/uapi/linux/netdev* F: include/uapi/linux/netlink.h F: include/uapi/linux/netlink_diag.h F: include/uapi/linux/rtnetlink.h +F: include/uapi/linux/sctp.h F: lib/net_utils.c F: lib/random32.c F: net/ @@ -16621,6 +16638,7 @@ F: tools/testing/selftests/net/mptcp/ NETWORKING [TCP] M: Eric Dumazet <edumazet@google.com> M: Neal Cardwell <ncardwell@google.com> +R: Kuniyuki Iwashima <kuniyu@amazon.com> L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/net_cachelines/tcp_sock.rst @@ -16648,6 +16666,31 @@ F: include/net/tls.h F: include/uapi/linux/tls.h F: net/tls/* +NETWORKING [SOCKETS] +M: Eric Dumazet <edumazet@google.com> +M: Kuniyuki Iwashima <kuniyu@amazon.com> +M: Paolo Abeni <pabeni@redhat.com> +M: Willem de Bruijn <willemb@google.com> +S: Maintained +F: include/linux/sock_diag.h +F: include/linux/socket.h +F: include/linux/sockptr.h +F: include/net/sock.h +F: include/net/sock_reuseport.h +F: include/uapi/linux/socket.h +F: net/core/*sock* +F: net/core/scm.c +F: net/socket.c + +NETWORKING [UNIX SOCKETS] +M: Kuniyuki Iwashima <kuniyu@amazon.com> +S: Maintained +F: include/net/af_unix.h +F: include/net/netns/unix.h +F: include/uapi/linux/unix_diag.h +F: net/unix/ +F: tools/testing/selftests/net/af_unix/ + NETXEN (1/10) GbE SUPPORT M: Manish Chopra <manishc@marvell.com> M: Rahul Verma <rahulv@marvell.com> @@ -16781,7 +16824,7 @@ F: include/linux/tick.h F: kernel/time/tick*.* NOKIA N900 CAMERA SUPPORT (ET8EK8 SENSOR, AD5820 FOCUS) -M: Pavel Machek <pavel@ucw.cz> +M: Pavel Machek <pavel@kernel.org> M: Sakari Ailus <sakari.ailus@iki.fi> L: linux-media@vger.kernel.org S: Maintained @@ -17713,6 +17756,7 @@ L: netdev@vger.kernel.org L: dev@openvswitch.org S: Maintained W: http://openvswitch.org +F: Documentation/networking/openvswitch.rst F: include/uapi/linux/openvswitch.h F: net/openvswitch/ F: tools/testing/selftests/net/openvswitch/ @@ -19312,7 +19356,6 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/tuners/qt1010* QUALCOMM ATH12K WIRELESS DRIVER -M: Kalle Valo <kvalo@kernel.org> M: Jeff Johnson <jjohnson@kernel.org> L: ath12k@lists.infradead.org S: Supported @@ -19322,7 +19365,6 @@ F: drivers/net/wireless/ath/ath12k/ N: ath12k QUALCOMM ATHEROS ATH10K WIRELESS DRIVER -M: Kalle Valo <kvalo@kernel.org> M: Jeff Johnson <jjohnson@kernel.org> L: ath10k@lists.infradead.org S: Supported @@ -19332,7 +19374,6 @@ F: drivers/net/wireless/ath/ath10k/ N: ath10k QUALCOMM ATHEROS ATH11K WIRELESS DRIVER -M: Kalle Valo <kvalo@kernel.org> M: Jeff Johnson <jjohnson@kernel.org> L: ath11k@lists.infradead.org S: Supported @@ -22806,7 +22847,7 @@ F: drivers/sh/ SUSPEND TO RAM M: "Rafael J. Wysocki" <rafael@kernel.org> M: Len Brown <len.brown@intel.com> -M: Pavel Machek <pavel@ucw.cz> +M: Pavel Machek <pavel@kernel.org> L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* diff --git a/arch/alpha/include/asm/elf.h b/arch/alpha/include/asm/elf.h index 4d7c46f50382..50c82187e60e 100644 --- a/arch/alpha/include/asm/elf.h +++ b/arch/alpha/include/asm/elf.h @@ -74,7 +74,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; /* * This is used to ensure we don't load something for the wrong architecture. */ -#define elf_check_arch(x) ((x)->e_machine == EM_ALPHA) +#define elf_check_arch(x) (((x)->e_machine == EM_ALPHA) && !((x)->e_flags & EF_ALPHA_32BIT)) /* * These are used to set parameters in the core dumps. @@ -137,10 +137,6 @@ extern int dump_elf_task(elf_greg_t *dest, struct task_struct *task); : amask (AMASK_CIX) ? "ev6" : "ev67"); \ }) -#define SET_PERSONALITY(EX) \ - set_personality(((EX).e_flags & EF_ALPHA_32BIT) \ - ? PER_LINUX_32BIT : PER_LINUX) - extern int alpha_l1i_cacheshape; extern int alpha_l1d_cacheshape; extern int alpha_l2_cacheshape; diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 635f0a5f5bbd..02e8817a8921 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -360,7 +360,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) extern void paging_init(void); -/* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ +/* We have our own get_unmapped_area */ #define HAVE_ARCH_UNMAPPED_AREA #endif /* _ALPHA_PGTABLE_H */ diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 55bb1c09fd39..5dce5518a211 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -8,23 +8,19 @@ #ifndef __ASM_ALPHA_PROCESSOR_H #define __ASM_ALPHA_PROCESSOR_H -#include <linux/personality.h> /* for ADDR_LIMIT_32BIT */ - /* * We have a 42-bit user address space: 4TB user VM... */ #define TASK_SIZE (0x40000000000UL) -#define STACK_TOP \ - (current->personality & ADDR_LIMIT_32BIT ? 0x80000000 : 0x00120000000UL) +#define STACK_TOP (0x00120000000UL) #define STACK_TOP_MAX 0x00120000000UL /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define TASK_UNMAPPED_BASE \ - ((current->personality & ADDR_LIMIT_32BIT) ? 0x40000000 : TASK_SIZE / 2) +#define TASK_UNMAPPED_BASE (TASK_SIZE / 2) /* This is dead. Everything has been moved to thread_info. */ struct thread_struct { }; diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 86185021f75a..a08e8edef1a4 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1210,8 +1210,7 @@ SYSCALL_DEFINE1(old_adjtimex, struct timex32 __user *, txc_p) return ret; } -/* Get an address range which is currently unmapped. Similar to the - generic version except that we know how to honor ADDR_LIMIT_32BIT. */ +/* Get an address range which is currently unmapped. */ static unsigned long arch_get_unmapped_area_1(unsigned long addr, unsigned long len, @@ -1230,13 +1229,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - unsigned long limit; - - /* "32 bit" actually means 31 bit, since pointers sign extend. */ - if (current->personality & ADDR_LIMIT_32BIT) - limit = 0x80000000; - else - limit = TASK_SIZE; + unsigned long limit = TASK_SIZE; if (len > limit) return -ENOMEM; diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index d3d243366536..231c0cd9c7b4 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -471,10 +471,8 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, should_fire); - if (should_fire != ctx->irq.level) { + if (should_fire != ctx->irq.level) kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); - return; - } kvm_timer_update_status(ctx, should_fire); @@ -761,21 +759,6 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, timer_irq(map->direct_ptimer), &arch_timer_irq_ops); WARN_ON_ONCE(ret); - - /* - * The virtual offset behaviour is "interesting", as it - * always applies when HCR_EL2.E2H==0, but only when - * accessed from EL1 when HCR_EL2.E2H==1. So make sure we - * track E2H when putting the HV timer in "direct" mode. - */ - if (map->direct_vtimer == vcpu_hvtimer(vcpu)) { - struct arch_timer_offset *offs = &map->direct_vtimer->offset; - - if (vcpu_el2_e2h_is_set(vcpu)) - offs->vcpu_offset = NULL; - else - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); - } } } @@ -976,31 +959,21 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) * which allows trapping of the timer registers even with NV2. * Still, this is still worse than FEAT_NV on its own. Meh. */ - if (!vcpu_el2_e2h_is_set(vcpu)) { - if (cpus_have_final_cap(ARM64_HAS_ECV)) - return; - - /* - * A non-VHE guest hypervisor doesn't have any direct access - * to its timers: the EL2 registers trap (and the HW is - * fully emulated), while the EL0 registers access memory - * despite the access being notionally direct. Boo. - * - * We update the hardware timer registers with the - * latest value written by the guest to the VNCR page - * and let the hardware take care of the rest. - */ - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL); - } else { + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { /* * For a VHE guest hypervisor, the EL2 state is directly - * stored in the host EL1 timers, while the emulated EL0 + * stored in the host EL1 timers, while the emulated EL1 * state is stored in the VNCR page. The latter could have * been updated behind our back, and we must reset the * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. */ struct timer_map map; get_timer_map(vcpu, &map); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 646e806c6ca6..071a7d75be68 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2290,6 +2290,19 @@ static int __init init_subsystems(void) break; case -ENODEV: case -ENXIO: + /* + * No VGIC? No pKVM for you. + * + * Protected mode assumes that VGICv3 is present, so no point + * in trying to hobble along if vgic initialization fails. + */ + if (is_protected_kvm_enabled()) + goto out; + + /* + * Otherwise, userspace could choose to implement a GIC for its + * guest on non-cooperative hardware. + */ vgic_present = false; err = 0; break; @@ -2400,6 +2413,13 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + + /* + * Flush entire BSS since part of its data containing init symbols is read + * while the MMU is off. + */ + kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); } static int __init kvm_hyp_init_protection(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 5c134520e180..6e12c070832f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -91,11 +91,34 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; @@ -123,6 +146,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) unsigned int i; fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 33d2ace68665..0c9387d2f507 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -67,26 +67,27 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) if (!tmp) return -ENOMEM; + swap(kvm->arch.nested_mmus, tmp); + /* * If we went through a realocation, adjust the MMU back-pointers in * the previously initialised kvm_pgtable structures. */ if (kvm->arch.nested_mmus != tmp) for (int i = 0; i < kvm->arch.nested_mmus_size; i++) - tmp[i].pgt->mmu = &tmp[i]; + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) - ret = init_nested_s2_mmu(kvm, &tmp[i]); + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); if (ret) { for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) - kvm_free_stage2_pgd(&tmp[i]); + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); return ret; } kvm->arch.nested_mmus_size = num_mmus; - kvm->arch.nested_mmus = tmp; return 0; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f6cd1ea7fb55..82430c1e1dd0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1452,6 +1452,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static bool access_hv_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!vcpu_el2_e2h_is_set(vcpu)) + return undef_access(vcpu, p, r); + + return access_arch_timer(vcpu, p, r); +} + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -3103,9 +3113,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), - { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer }, - EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0), - EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0), + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, + EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), + EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, diff --git a/arch/loongarch/include/asm/cpu-info.h b/arch/loongarch/include/asm/cpu-info.h index 35e0a230a484..7f5bc0ad9d50 100644 --- a/arch/loongarch/include/asm/cpu-info.h +++ b/arch/loongarch/include/asm/cpu-info.h @@ -76,27 +76,6 @@ extern const char *__cpu_full_name[]; #define cpu_family_string() __cpu_family[raw_smp_processor_id()] #define cpu_full_name_string() __cpu_full_name[raw_smp_processor_id()] -struct seq_file; -struct notifier_block; - -extern int register_proc_cpuinfo_notifier(struct notifier_block *nb); -extern int proc_cpuinfo_notifier_call_chain(unsigned long val, void *v); - -#define proc_cpuinfo_notifier(fn, pri) \ -({ \ - static struct notifier_block fn##_nb = { \ - .notifier_call = fn, \ - .priority = pri \ - }; \ - \ - register_proc_cpuinfo_notifier(&fn##_nb); \ -}) - -struct proc_cpuinfo_notifier_args { - struct seq_file *m; - unsigned long n; -}; - static inline bool cpus_are_siblings(int cpua, int cpub) { struct cpuinfo_loongarch *infoa = &cpu_data[cpua]; diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h index 3383c9d24e94..b87d1d5e5890 100644 --- a/arch/loongarch/include/asm/smp.h +++ b/arch/loongarch/include/asm/smp.h @@ -77,6 +77,8 @@ extern int __cpu_logical_map[NR_CPUS]; #define SMP_IRQ_WORK BIT(ACTION_IRQ_WORK) #define SMP_CLEAR_VECTOR BIT(ACTION_CLEAR_VECTOR) +struct seq_file; + struct secondary_data { unsigned long stack; unsigned long thread_info; diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 86d5d90ebefe..4f0912141781 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -18,16 +18,19 @@ .align 5 SYM_FUNC_START(__arch_cpu_idle) - /* start of rollback region */ - LONG_L t0, tp, TI_FLAGS - nop - andi t0, t0, _TIF_NEED_RESCHED - bnez t0, 1f - nop - nop - nop + /* start of idle interrupt region */ + ori t0, zero, CSR_CRMD_IE + /* idle instruction needs irq enabled */ + csrxchg t0, t0, LOONGARCH_CSR_CRMD + /* + * If an interrupt lands here; between enabling interrupts above and + * going idle on the next instruction, we must *NOT* go idle since the + * interrupt could have set TIF_NEED_RESCHED or caused an timer to need + * reprogramming. Fall through -- see handle_vint() below -- and have + * the idle loop take care of things. + */ idle 0 - /* end of rollback region */ + /* end of idle interrupt region */ 1: jr ra SYM_FUNC_END(__arch_cpu_idle) @@ -35,11 +38,10 @@ SYM_CODE_START(handle_vint) UNWIND_HINT_UNDEFINED BACKUP_T0T1 SAVE_ALL - la_abs t1, __arch_cpu_idle + la_abs t1, 1b LONG_L t0, sp, PT_ERA - /* 32 byte rollback region */ - ori t0, t0, 0x1f - xori t0, t0, 0x1f + /* 3 instructions idle interrupt region */ + ori t0, t0, 0b1100 bne t0, t1, 1f LONG_S t0, sp, PT_ERA 1: move a0, sp diff --git a/arch/loongarch/kernel/idle.c b/arch/loongarch/kernel/idle.c index 0b5dd2faeb90..54b247d8cdb6 100644 --- a/arch/loongarch/kernel/idle.c +++ b/arch/loongarch/kernel/idle.c @@ -11,7 +11,6 @@ void __cpuidle arch_cpu_idle(void) { - raw_local_irq_enable(); - __arch_cpu_idle(); /* idle instruction needs irq enabled */ + __arch_cpu_idle(); raw_local_irq_disable(); } diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index 6ce46d92f1f1..cea30768ae92 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -13,28 +13,12 @@ #include <asm/processor.h> #include <asm/time.h> -/* - * No lock; only written during early bootup by CPU 0. - */ -static RAW_NOTIFIER_HEAD(proc_cpuinfo_chain); - -int __ref register_proc_cpuinfo_notifier(struct notifier_block *nb) -{ - return raw_notifier_chain_register(&proc_cpuinfo_chain, nb); -} - -int proc_cpuinfo_notifier_call_chain(unsigned long val, void *v) -{ - return raw_notifier_call_chain(&proc_cpuinfo_chain, val, v); -} - static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; unsigned int isa = cpu_data[n].isa_level; unsigned int version = cpu_data[n].processor_id & 0xff; unsigned int fp_version = cpu_data[n].fpu_vers; - struct proc_cpuinfo_notifier_args proc_cpuinfo_notifier_args; #ifdef CONFIG_SMP if (!cpu_online(n)) @@ -91,20 +75,13 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has_lbt_mips) seq_printf(m, " lbt_mips"); seq_printf(m, "\n"); - seq_printf(m, "Hardware Watchpoint\t: %s", - cpu_has_watch ? "yes, " : "no\n"); + seq_printf(m, "Hardware Watchpoint\t: %s", str_yes_no(cpu_has_watch)); if (cpu_has_watch) { - seq_printf(m, "iwatch count: %d, dwatch count: %d\n", + seq_printf(m, ", iwatch count: %d, dwatch count: %d", cpu_data[n].watch_ireg_count, cpu_data[n].watch_dreg_count); } - proc_cpuinfo_notifier_args.m = m; - proc_cpuinfo_notifier_args.n = n; - - raw_notifier_call_chain(&proc_cpuinfo_chain, 0, - &proc_cpuinfo_notifier_args); - - seq_printf(m, "\n"); + seq_printf(m, "\n\n"); return 0; } diff --git a/arch/loongarch/kernel/reset.c b/arch/loongarch/kernel/reset.c index 1ef8c6383535..de8fa5a8a825 100644 --- a/arch/loongarch/kernel/reset.c +++ b/arch/loongarch/kernel/reset.c @@ -33,7 +33,7 @@ void machine_halt(void) console_flush_on_panic(CONSOLE_FLUSH_PENDING); while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } @@ -53,7 +53,7 @@ void machine_power_off(void) #endif while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } @@ -74,6 +74,6 @@ void machine_restart(char *command) acpi_reboot(); while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index bf9268bf26d5..f6d3242b9234 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -303,9 +303,9 @@ int kvm_arch_enable_virtualization_cpu(void) * TOE=0: Trap on Exception. * TIT=0: Trap on Timer. */ - if (env & CSR_GCFG_GCIP_ALL) + if (env & CSR_GCFG_GCIP_SECURE) gcfg |= CSR_GCFG_GCI_SECURE; - if (env & CSR_GCFG_MATC_ROOT) + if (env & CSR_GCFG_MATP_ROOT) gcfg |= CSR_GCFG_MATC_ROOT; write_csr_gcfg(gcfg); diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 0c292f818492..1be185e94807 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -85,7 +85,7 @@ * Guest CRMD comes from separate GCSR_CRMD register */ ori t0, zero, CSR_PRMD_PIE - csrxchg t0, t0, LOONGARCH_CSR_PRMD + csrwr t0, LOONGARCH_CSR_PRMD /* Set PVM bit to setup ertn to guest context */ ori t0, zero, CSR_GSTAT_PVM diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index fb72095c8077..20f941af3e9e 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1548,9 +1548,6 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Restore timer state regardless */ kvm_restore_timer(vcpu); - - /* Control guest page CCA attribute */ - change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); /* Restore hardware PMU CSRs */ diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c index a5e84b403c3b..df309ae4045d 100644 --- a/arch/loongarch/lib/csum.c +++ b/arch/loongarch/lib/csum.c @@ -25,7 +25,7 @@ unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len) const u64 *ptr; u64 data, sum64 = 0; - if (unlikely(len == 0)) + if (unlikely(len <= 0)) return 0; offset = (unsigned long)buff & 7; diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c index bf8678248444..99165903908a 100644 --- a/arch/loongarch/mm/pageattr.c +++ b/arch/loongarch/mm/pageattr.c @@ -3,6 +3,7 @@ * Copyright (C) 2024 Loongson Technology Corporation Limited */ +#include <linux/memblock.h> #include <linux/pagewalk.h> #include <linux/pgtable.h> #include <asm/set_memory.h> @@ -167,7 +168,7 @@ bool kernel_page_present(struct page *page) unsigned long addr = (unsigned long)page_address(page); if (addr < vm_map_base) - return true; + return memblock_is_memory(__pa(addr)); pgd = pgd_offset_k(addr); if (pgd_none(pgdp_get(pgd))) diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 1aa0cb097c9c..7b9a5ea9cad9 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -75,7 +75,7 @@ static void fsl_msi_print_chip(struct irq_data *irqd, struct seq_file *p) srs = (hwirq >> msi_data->srs_shift) & MSI_SRS_MASK; cascade_virq = msi_data->cascade_array[srs]->virq; - seq_printf(p, " fsl-msi-%d", cascade_virq); + seq_printf(p, "fsl-msi-%d", cascade_virq); } diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 13f51a6a5bb1..4e73ef46d4b2 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -23,7 +23,6 @@ /** * struct gmap_struct - guest address space * @list: list head for the mm->context gmap list - * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries @@ -35,7 +34,6 @@ * @guest_handle: protected virtual machine handle for the ultravisor * @host_to_rmap: radix tree with gmap_rmap lists * @children: list of shadow gmap structures - * @pt_list: list of all page tables used in the shadow guest address space * @shadow_lock: spinlock to protect the shadow gmap list * @parent: pointer to the parent gmap for shadow guest address spaces * @orig_asce: ASCE for which the shadow page table has been created @@ -45,7 +43,6 @@ */ struct gmap { struct list_head list; - struct list_head crst_list; struct mm_struct *mm; struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; @@ -61,7 +58,6 @@ struct gmap { /* Additional data for shadow guest address spaces */ struct radix_tree_root host_to_rmap; struct list_head children; - struct list_head pt_list; spinlock_t shadow_lock; struct gmap *parent; unsigned long orig_asce; @@ -106,23 +102,21 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); void gmap_remove(struct gmap *gmap); struct gmap *gmap_get(struct gmap *gmap); void gmap_put(struct gmap *gmap); +void gmap_free(struct gmap *gmap); +struct gmap *gmap_alloc(unsigned long limit); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -unsigned long gmap_translate(struct gmap *, unsigned long gaddr); int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level); -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +void gmap_unshadow(struct gmap *sg); int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, @@ -131,24 +125,22 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake); int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake); -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, int *fake); int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); -int gmap_mprotect_notify(struct gmap *, unsigned long start, - unsigned long len, int prot); +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits); void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); int s390_disable_cow_sharing(void); -void s390_unlist_old_asce(struct gmap *gmap); int s390_replace_asce(struct gmap *gmap); void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool interruptible); +int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split); +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); /** * s390_uv_destroy_range - Destroy a range of pages in the given mm. diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 97c7c8127543..9a367866cab0 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -30,6 +30,8 @@ #define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 +#define KVM_INTERNAL_MEM_SLOTS 1 + /* * These seem to be used for allocating ->chip in the routing table, which we * don't use. 1 is as small as we can get to reduce the needed memory. If we @@ -931,12 +933,14 @@ struct sie_page2 { u8 reserved928[0x1000 - 0x928]; /* 0x0928 */ }; +struct vsie_page; + struct kvm_s390_vsie { struct mutex mutex; struct radix_tree_root addr_to_page; int page_count; int next; - struct page *pages[KVM_MAX_VCPUS]; + struct vsie_page *pages[KVM_MAX_VCPUS]; }; struct kvm_s390_gisa_iam { diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a3b51056a177..3ca5af4cfe43 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -420,9 +420,10 @@ void setup_protection_map(void); #define PGSTE_HC_BIT 0x0020000000000000UL #define PGSTE_GR_BIT 0x0004000000000000UL #define PGSTE_GC_BIT 0x0002000000000000UL -#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ -#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ -#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ +#define PGSTE_ST2_MASK 0x0000ffff00000000UL +#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL @@ -2007,4 +2008,18 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define pmd_pgtable(pmd) \ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) +static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) +{ + unsigned long *pgstes, res; + + pgstes = pgt + _PAGE_ENTRIES; + + res = (pgstes[0] & PGSTE_ST2_MASK) << 16; + res |= pgstes[1] & PGSTE_ST2_MASK; + res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16; + res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32; + + return res; +} + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index dc332609f2c3..b11f5b6d0bd1 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -628,12 +628,12 @@ static inline int is_prot_virt_host(void) } int uv_pin_shared(unsigned long paddr); -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); int uv_destroy_folio(struct folio *folio); int uv_destroy_pte(pte_t pte); int uv_convert_from_secure_pte(pte_t pte); -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); +int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); +int uv_convert_from_secure(unsigned long paddr); +int uv_convert_from_secure_folio(struct folio *folio); void setup_uv(void); diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 6f9654a191ad..9f05df2da2f7 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -19,19 +19,6 @@ #include <asm/sections.h> #include <asm/uv.h> -#if !IS_ENABLED(CONFIG_KVM) -unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - return 0; -} - -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - return 0; -} -#endif - /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ int __bootdata_preserved(prot_virt_guest); EXPORT_SYMBOL(prot_virt_guest); @@ -159,6 +146,7 @@ int uv_destroy_folio(struct folio *folio) folio_put(folio); return rc; } +EXPORT_SYMBOL(uv_destroy_folio); /* * The present PTE still indirectly holds a folio reference through the mapping. @@ -175,7 +163,7 @@ int uv_destroy_pte(pte_t pte) * * @paddr: Absolute host address of page to be exported */ -static int uv_convert_from_secure(unsigned long paddr) +int uv_convert_from_secure(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, @@ -187,11 +175,12 @@ static int uv_convert_from_secure(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure); /* * The caller must already hold a reference to the folio. */ -static int uv_convert_from_secure_folio(struct folio *folio) +int uv_convert_from_secure_folio(struct folio *folio) { int rc; @@ -206,6 +195,7 @@ static int uv_convert_from_secure_folio(struct folio *folio) folio_put(folio); return rc; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure_folio); /* * The present PTE still indirectly holds a folio reference through the mapping. @@ -237,13 +227,33 @@ static int expected_folio_refs(struct folio *folio) return res; } -static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) +/** + * make_folio_secure() - make a folio secure + * @folio: the folio to make secure + * @uvcb: the uvcb that describes the UVC to be used + * + * The folio @folio will be made secure if possible, @uvcb will be passed + * as-is to the UVC. + * + * Return: 0 on success; + * -EBUSY if the folio is in writeback or has too many references; + * -E2BIG if the folio is large; + * -EAGAIN if the UVC needs to be attempted again; + * -ENXIO if the address is not mapped; + * -EINVAL if the UVC failed for other reasons. + * + * Context: The caller must hold exactly one extra reference on the folio + * (it's the same logic as split_folio()) + */ +int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { int expected, cc = 0; + if (folio_test_large(folio)) + return -E2BIG; if (folio_test_writeback(folio)) - return -EAGAIN; - expected = expected_folio_refs(folio); + return -EBUSY; + expected = expected_folio_refs(folio) + 1; if (!folio_ref_freeze(folio, expected)) return -EBUSY; set_bit(PG_arch_1, &folio->flags); @@ -267,251 +277,7 @@ static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) return -EAGAIN; return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } - -/** - * should_export_before_import - Determine whether an export is needed - * before an import-like operation - * @uvcb: the Ultravisor control block of the UVC to be performed - * @mm: the mm of the process - * - * Returns whether an export is needed before every import-like operation. - * This is needed for shared pages, which don't trigger a secure storage - * exception when accessed from a different guest. - * - * Although considered as one, the Unpin Page UVC is not an actual import, - * so it is not affected. - * - * No export is needed also when there is only one protected VM, because the - * page cannot belong to the wrong VM in that case (there is no "other VM" - * it can belong to). - * - * Return: true if an export is needed before every import, otherwise false. - */ -static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) -{ - /* - * The misc feature indicates, among other things, that importing a - * shared page from a different protected VM will automatically also - * transfer its ownership. - */ - if (uv_has_feature(BIT_UV_FEAT_MISC)) - return false; - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) - return false; - return atomic_read(&mm->context.protected_count) > 1; -} - -/* - * Drain LRU caches: the local one on first invocation and the ones of all - * CPUs on successive invocations. Returns "true" on the first invocation. - */ -static bool drain_lru(bool *drain_lru_called) -{ - /* - * If we have tried a local drain and the folio refcount - * still does not match our expected safe value, try with a - * system wide drain. This is needed if the pagevecs holding - * the page are on a different CPU. - */ - if (*drain_lru_called) { - lru_add_drain_all(); - /* We give up here, don't retry immediately. */ - return false; - } - /* - * We are here if the folio refcount does not match the - * expected safe value. The main culprits are usually - * pagevecs. With lru_add_drain() we drain the pagevecs - * on the local CPU so that hopefully the refcount will - * reach the expected safe value. - */ - lru_add_drain(); - *drain_lru_called = true; - /* The caller should try again immediately */ - return true; -} - -/* - * Requests the Ultravisor to make a page accessible to a guest. - * If it's brought in the first time, it will be cleared. If - * it has been exported before, it will be decrypted and integrity - * checked. - */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) -{ - struct vm_area_struct *vma; - bool drain_lru_called = false; - spinlock_t *ptelock; - unsigned long uaddr; - struct folio *folio; - pte_t *ptep; - int rc; - -again: - rc = -EFAULT; - mmap_read_lock(gmap->mm); - - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = vma_lookup(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. If - * userspace is playing dirty tricky with mapping huge pages later - * on this will result in a segmentation fault. - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = -ENXIO; - ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); - if (!ptep) - goto out; - if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) { - folio = page_folio(pte_page(*ptep)); - rc = -EAGAIN; - if (folio_test_large(folio)) { - rc = -E2BIG; - } else if (folio_trylock(folio)) { - if (should_export_before_import(uvcb, gmap->mm)) - uv_convert_from_secure(PFN_PHYS(folio_pfn(folio))); - rc = make_folio_secure(folio, uvcb); - folio_unlock(folio); - } - - /* - * Once we drop the PTL, the folio may get unmapped and - * freed immediately. We need a temporary reference. - */ - if (rc == -EAGAIN || rc == -E2BIG) - folio_get(folio); - } - pte_unmap_unlock(ptep, ptelock); -out: - mmap_read_unlock(gmap->mm); - - switch (rc) { - case -E2BIG: - folio_lock(folio); - rc = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - - switch (rc) { - case 0: - /* Splitting succeeded, try again immediately. */ - goto again; - case -EAGAIN: - /* Additional folio references. */ - if (drain_lru(&drain_lru_called)) - goto again; - return -EAGAIN; - case -EBUSY: - /* Unexpected race. */ - return -EAGAIN; - } - WARN_ON_ONCE(1); - return -ENXIO; - case -EAGAIN: - /* - * If we are here because the UVC returned busy or partial - * completion, this is just a useless check, but it is safe. - */ - folio_wait_writeback(folio); - folio_put(folio); - return -EAGAIN; - case -EBUSY: - /* Additional folio references. */ - if (drain_lru(&drain_lru_called)) - goto again; - return -EAGAIN; - case -ENXIO: - if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) - return -EFAULT; - return -EAGAIN; - } - return rc; -} -EXPORT_SYMBOL_GPL(gmap_make_secure); - -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) -{ - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, - }; - - return gmap_make_secure(gmap, gaddr, &uvcb); -} -EXPORT_SYMBOL_GPL(gmap_convert_to_secure); - -/** - * gmap_destroy_page - Destroy a guest page. - * @gmap: the gmap of the guest - * @gaddr: the guest address to destroy - * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. - */ -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) -{ - struct vm_area_struct *vma; - struct folio_walk fw; - unsigned long uaddr; - struct folio *folio; - int rc; - - rc = -EFAULT; - mmap_read_lock(gmap->mm); - - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = vma_lookup(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Huge pages should not be able to become secure - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = 0; - folio = folio_walk_start(&fw, vma, uaddr, 0); - if (!folio) - goto out; - /* - * See gmap_make_secure(): large folios cannot be secure. Small - * folio implies FW_LEVEL_PTE. - */ - if (folio_test_large(folio) || !pte_write(fw.pte)) - goto out_walk_end; - rc = uv_destroy_folio(folio); - /* - * Fault handlers can race; it is possible that two CPUs will fault - * on the same secure page. One CPU can destroy the page, reboot, - * re-enter secure mode and import it, while the second CPU was - * stuck at the beginning of the handler. At some point the second - * CPU will be able to progress, and it will not be able to destroy - * the page. In that case we do not want to terminate the process, - * we instead try to export the page. - */ - if (rc) - rc = uv_convert_from_secure_folio(folio); -out_walk_end: - folio_walk_end(&fw, vma); -out: - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_destroy_page); +EXPORT_SYMBOL_GPL(make_folio_secure); /* * To be called with the folio locked or with an extra reference! This will diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 02217fb4ae10..f0ffe874adc2 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9816b0060fbe..f6fded15633a 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -16,6 +16,7 @@ #include <asm/gmap.h> #include <asm/dat-bits.h> #include "kvm-s390.h" +#include "gmap.h" #include "gaccess.h" /* @@ -1393,6 +1394,44 @@ shadow_pgt: } /** + * shadow_pgt_lookup() - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_lock in read. + */ +static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, + int *dat_protection, int *fake) +{ + unsigned long pt_index; + unsigned long *table; + struct page *page; + int rc; + + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); + *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; +} + +/** * kvm_s390_shadow_fault - handle fault on a shadow page table * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure @@ -1415,6 +1454,9 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, int dat_protection, fake; int rc; + if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) + return -EFAULT; + mmap_read_lock(sg->mm); /* * We don't want any guest-2 tables to change - so the parent @@ -1423,7 +1465,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, */ ipte_lock(vcpu->kvm); - rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, &fake); diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c new file mode 100644 index 000000000000..a6d1dbb04c97 --- /dev/null +++ b/arch/s390/kvm/gmap-vsie.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest memory management for KVM/s390 nested VMs. + * + * Copyright IBM Corp. 2008, 2020, 2024 + * + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> + */ + +#include <linux/compiler.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/pgtable.h> +#include <linux/pagemap.h> +#include <linux/mman.h> + +#include <asm/lowcore.h> +#include <asm/gmap.h> +#include <asm/uv.h> + +#include "kvm-s390.h" +#include "gmap.h" + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + * + * Context: Called with parent->shadow_lock held + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) +{ + struct gmap *sg; + + lockdep_assert_held(&parent->shadow_lock); + list_for_each_entry(sg, &parent->children, list) { + if (!gmap_shadow_valid(sg, asce, edat_level)) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + refcount_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || + KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) + return ERR_PTR(-EFAULT); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->private = parent->private; + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + refcount_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + mmap_read_lock(parent->mm); + rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1), + PROT_READ, GMAP_NOTIFY_SHADOW); + mmap_read_unlock(parent->mm); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c new file mode 100644 index 000000000000..02adf151d4de --- /dev/null +++ b/arch/s390/kvm/gmap.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest memory management for KVM/s390 + * + * Copyright IBM Corp. 2008, 2020, 2024 + * + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> + */ + +#include <linux/compiler.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/pgtable.h> +#include <linux/pagemap.h> + +#include <asm/lowcore.h> +#include <asm/gmap.h> +#include <asm/uv.h> + +#include "gmap.h" + +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb) +{ + struct folio *folio = page_folio(page); + int rc; + + /* + * Secure pages cannot be huge and userspace should not combine both. + * In case userspace does it anyway this will result in an -EFAULT for + * the unpack. The guest is thus never reaching secure mode. + * If userspace plays dirty tricks and decides to map huge pages at a + * later point in time, it will receive a segmentation fault or + * KVM_RUN will return -EFAULT. + */ + if (folio_test_hugetlb(folio)) + return -EFAULT; + if (folio_test_large(folio)) { + mmap_read_unlock(gmap->mm); + rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true); + mmap_read_lock(gmap->mm); + if (rc) + return rc; + folio = page_folio(page); + } + + if (!folio_trylock(folio)) + return -EAGAIN; + if (should_export_before_import(uvcb, gmap->mm)) + uv_convert_from_secure(folio_to_phys(folio)); + rc = make_folio_secure(folio, uvcb); + folio_unlock(folio); + + /* + * In theory a race is possible and the folio might have become + * large again before the folio_trylock() above. In that case, no + * action is performed and -EAGAIN is returned; the callers will + * have to try again later. + * In most cases this implies running the VM again, getting the same + * exception again, and make another attempt in this function. + * This is expected to happen extremely rarely. + */ + if (rc == -E2BIG) + return -EAGAIN; + /* The folio has too many references, try to shake some off */ + if (rc == -EBUSY) { + mmap_read_unlock(gmap->mm); + kvm_s390_wiggle_split_folio(gmap->mm, folio, false); + mmap_read_lock(gmap->mm); + return -EAGAIN; + } + + return rc; +} + +/** + * gmap_make_secure() - make one guest page secure + * @gmap: the guest gmap + * @gaddr: the guest address that needs to be made secure + * @uvcb: the UVCB specifying which operation needs to be performed + * + * Context: needs to be called with kvm->srcu held. + * Return: 0 on success, < 0 in case of error (see __gmap_make_secure()). + */ +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +{ + struct kvm *kvm = gmap->private; + struct page *page; + int rc = 0; + + lockdep_assert_held(&kvm->srcu); + + page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); + mmap_read_lock(gmap->mm); + if (page) + rc = __gmap_make_secure(gmap, page, uvcb); + kvm_release_page_clean(page); + mmap_read_unlock(gmap->mm); + + return rc; +} + +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = gmap->guest_handle, + .gaddr = gaddr, + }; + + return gmap_make_secure(gmap, gaddr, &uvcb); +} + +/** + * __gmap_destroy_page() - Destroy a guest page. + * @gmap: the gmap of the guest + * @page: the page to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: must be called holding the mm lock for gmap->mm + */ +static int __gmap_destroy_page(struct gmap *gmap, struct page *page) +{ + struct folio *folio = page_folio(page); + int rc; + + /* + * See gmap_make_secure(): large folios cannot be secure. Small + * folio implies FW_LEVEL_PTE. + */ + if (folio_test_large(folio)) + return -EFAULT; + + rc = uv_destroy_folio(folio); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_from_secure_folio(folio); + + return rc; +} + +/** + * gmap_destroy_page() - Destroy a guest page. + * @gmap: the gmap of the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: may sleep. + */ +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +{ + struct page *page; + int rc = 0; + + mmap_read_lock(gmap->mm); + page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr)); + if (page) + rc = __gmap_destroy_page(gmap, page); + kvm_release_page_clean(page); + mmap_read_unlock(gmap->mm); + return rc; +} diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h new file mode 100644 index 000000000000..c8f031c9ea5f --- /dev/null +++ b/arch/s390/kvm/gmap.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016, 2025 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef ARCH_KVM_S390_GMAP_H +#define ARCH_KVM_S390_GMAP_H + +#define GMAP_SHADOW_FAKE_TABLE 1ULL + +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} + +#endif diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 5bbaadf75dc6..610dd44a948b 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -21,6 +21,7 @@ #include "gaccess.h" #include "trace.h" #include "trace-s390.h" +#include "gmap.h" u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { @@ -367,7 +368,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = gmap_fault(vcpu->arch.gmap, srcaddr, 0); + rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); if (rc != 0) return rc; @@ -376,7 +377,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = gmap_fault(vcpu->arch.gmap, dstaddr, FAULT_FLAG_WRITE); + rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); if (rc != 0) return rc; @@ -549,7 +550,7 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) * If the unpin did not succeed, the guest will exit again for the UVC * and we will retry the unpin. */ - if (rc == -EINVAL) + if (rc == -EINVAL || rc == -ENXIO) return 0; /* * If we got -EAGAIN here, we simply return it. It will eventually diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d4f031e086fc..07ff0e10cb7f 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2893,7 +2893,8 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - u64 uaddr; + u64 uaddr_s, uaddr_i; + int idx; switch (ue->type) { /* we store the userspace addresses instead of the guest addresses */ @@ -2901,14 +2902,16 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_ucontrol(kvm)) return -EINVAL; e->set = set_adapter_int; - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); - if (uaddr == -EFAULT) - return -EFAULT; - e->adapter.summary_addr = uaddr; - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); - if (uaddr == -EFAULT) + + idx = srcu_read_lock(&kvm->srcu); + uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr); + uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr); + srcu_read_unlock(&kvm->srcu, idx); + + if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; - e->adapter.ind_addr = uaddr; + e->adapter.summary_addr = uaddr_s; + e->adapter.ind_addr = uaddr_i; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d8080c27d45b..ebecb96bacce 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -50,6 +50,7 @@ #include "kvm-s390.h" #include "gaccess.h" #include "pci.h" +#include "gmap.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -3428,8 +3429,20 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) VM_EVENT(kvm, 3, "vm created with type %lu", type); if (type & KVM_VM_S390_UCONTROL) { + struct kvm_userspace_memory_region2 fake_memslot = { + .slot = KVM_S390_UCONTROL_MEMSLOT, + .guest_phys_addr = 0, + .userspace_addr = 0, + .memory_size = ALIGN_DOWN(TASK_SIZE, _SEGMENT_SIZE), + .flags = 0, + }; + kvm->arch.gmap = NULL; kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; + /* one flat fake memslot covering the whole address-space */ + mutex_lock(&kvm->slots_lock); + KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); + mutex_unlock(&kvm->slots_lock); } else { if (sclp.hamax == U64_MAX) kvm->arch.mem_limit = TASK_SIZE_MAX; @@ -4498,6 +4511,75 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } +static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) +{ + struct kvm *kvm = gmap->private; + gfn_t gfn = gpa_to_gfn(gaddr); + bool unlocked; + hva_t vmaddr; + gpa_t tmp; + int rc; + + if (kvm_is_ucontrol(kvm)) { + tmp = __gmap_translate(gmap, gaddr); + gfn = gpa_to_gfn(tmp); + } + + vmaddr = gfn_to_hva(kvm, gfn); + rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); + if (!rc) + rc = __gmap_link(gmap, gaddr, vmaddr); + return rc; +} + +/** + * __kvm_s390_mprotect_many() - Apply specified protection to guest pages + * @gmap: the gmap of the guest + * @gpa: the starting guest address + * @npages: how many pages to protect + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() + * + * Context: kvm->srcu and gmap->mm need to be held in read mode + */ +int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, + unsigned long bits) +{ + unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + gpa_t end = gpa + npages * PAGE_SIZE; + int rc; + + for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { + rc = gmap_protect_one(gmap, gpa, prot, bits); + if (rc == -EAGAIN) { + __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); + rc = gmap_protect_one(gmap, gpa, prot, bits); + } + if (rc < 0) + return rc; + } + + return 0; +} + +static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) +{ + gpa_t gaddr = kvm_s390_get_prefix(vcpu); + int idx, rc; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + mmap_read_lock(vcpu->arch.gmap->mm); + + rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); + + mmap_read_unlock(vcpu->arch.gmap->mm); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return rc; +} + static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { retry: @@ -4513,9 +4595,8 @@ retry: */ if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; - rc = gmap_mprotect_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2, PROT_WRITE); + + rc = kvm_s390_mprotect_notify_prefix(vcpu); if (rc) { kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; @@ -4766,11 +4847,111 @@ static int vcpu_post_run_addressing_exception(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_irq(vcpu, &pgm_info); } +static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) +{ + KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, + "Unexpected program interrupt 0x%x, TEID 0x%016lx", + current->thread.gmap_int_code, current->thread.gmap_teid.val); +} + +/* + * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu + * @vcpu: the vCPU whose gmap is to be fixed up + * @gfn: the guest frame number used for memslots (including fake memslots) + * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps + * @flags: FOLL_* flags + * + * Return: 0 on success, < 0 in case of error. + * Context: The mm lock must not be held before calling. May sleep. + */ +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) +{ + struct kvm_memory_slot *slot; + unsigned int fault_flags; + bool writable, unlocked; + unsigned long vmaddr; + struct page *page; + kvm_pfn_t pfn; + int rc; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return vcpu_post_run_addressing_exception(vcpu); + + fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; + if (vcpu->arch.gmap->pfault_enabled) + flags |= FOLL_NOWAIT; + vmaddr = __gfn_to_hva_memslot(slot, gfn); + +try_again: + pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + + /* Access outside memory, inject addressing exception */ + if (is_noslot_pfn(pfn)) + return vcpu_post_run_addressing_exception(vcpu); + /* Signal pending: try again */ + if (pfn == KVM_PFN_ERR_SIGPENDING) + return -EAGAIN; + + /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ + if (pfn == KVM_PFN_ERR_NEEDS_IO) { + trace_kvm_s390_major_guest_pfault(vcpu); + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + /* Could not setup async pfault, try again synchronously */ + flags &= ~FOLL_NOWAIT; + goto try_again; + } + /* Any other error */ + if (is_error_pfn(pfn)) + return -EFAULT; + + /* Success */ + mmap_read_lock(vcpu->arch.gmap->mm); + /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ + rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); + if (!rc) + rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); + scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { + kvm_release_faultin_page(vcpu->kvm, page, false, writable); + } + mmap_read_unlock(vcpu->arch.gmap->mm); + return rc; +} + +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) +{ + unsigned long gaddr_tmp; + gfn_t gfn; + + gfn = gpa_to_gfn(gaddr); + if (kvm_is_ucontrol(vcpu->kvm)) { + /* + * This translates the per-vCPU guest address into a + * fake guest address, which can then be used with the + * fake memslots that are identity mapping userspace. + * This allows ucontrol VMs to use the normal fault + * resolution path, like normal VMs. + */ + mmap_read_lock(vcpu->arch.gmap->mm); + gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); + mmap_read_unlock(vcpu->arch.gmap->mm); + if (gaddr_tmp == -EFAULT) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = gaddr; + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; + return -EREMOTE; + } + gfn = gpa_to_gfn(gaddr_tmp); + } + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); +} + static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { unsigned int flags = 0; unsigned long gaddr; - int rc = 0; gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; if (kvm_s390_cur_gmap_fault_is_write()) @@ -4781,9 +4962,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) vcpu->stat.exit_null++; break; case PGM_NON_SECURE_STORAGE_ACCESS: - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, - "Unexpected program interrupt 0x%x, TEID 0x%016lx", - current->thread.gmap_int_code, current->thread.gmap_teid.val); + kvm_s390_assert_primary_as(vcpu); /* * This is normal operation; a page belonging to a protected * guest has not been imported yet. Try to import the page into @@ -4794,9 +4973,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) break; case PGM_SECURE_STORAGE_ACCESS: case PGM_SECURE_STORAGE_VIOLATION: - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, - "Unexpected program interrupt 0x%x, TEID 0x%016lx", - current->thread.gmap_int_code, current->thread.gmap_teid.val); + kvm_s390_assert_primary_as(vcpu); /* * This can happen after a reboot with asynchronous teardown; * the new guest (normal or protected) will run on top of the @@ -4825,40 +5002,15 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_FIRST_TRANS: case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, - "Unexpected program interrupt 0x%x, TEID 0x%016lx", - current->thread.gmap_int_code, current->thread.gmap_teid.val); - if (vcpu->arch.gmap->pfault_enabled) { - rc = gmap_fault(vcpu->arch.gmap, gaddr, flags | FAULT_FLAG_RETRY_NOWAIT); - if (rc == -EFAULT) - return vcpu_post_run_addressing_exception(vcpu); - if (rc == -EAGAIN) { - trace_kvm_s390_major_guest_pfault(vcpu); - if (kvm_arch_setup_async_pf(vcpu)) - return 0; - vcpu->stat.pfault_sync++; - } else { - return rc; - } - } - rc = gmap_fault(vcpu->arch.gmap, gaddr, flags); - if (rc == -EFAULT) { - if (kvm_is_ucontrol(vcpu->kvm)) { - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; - vcpu->run->s390_ucontrol.pgm_code = 0x10; - return -EREMOTE; - } - return vcpu_post_run_addressing_exception(vcpu); - } - break; + kvm_s390_assert_primary_as(vcpu); + return vcpu_dat_fault_handler(vcpu, gaddr, flags); default: KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", current->thread.gmap_int_code, current->thread.gmap_teid.val); send_sig(SIGSEGV, current, 0); break; } - return rc; + return 0; } static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) @@ -5737,7 +5889,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_S390_VCPU_FAULT: { - r = gmap_fault(vcpu->arch.gmap, arg, 0); + idx = srcu_read_lock(&vcpu->kvm->srcu); + r = vcpu_dat_fault_handler(vcpu, arg, 0); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_ENABLE_CAP: @@ -5853,7 +6007,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, { gpa_t size; - if (kvm_is_ucontrol(kvm)) + if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS) return -EINVAL; /* When we are protected, we should not change the memory slots */ @@ -5905,6 +6059,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int rc = 0; + if (kvm_is_ucontrol(kvm)) + return; + switch (change) { case KVM_MR_DELETE: rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 597d7a71deeb..8d3bbb2dd8d2 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -20,6 +20,8 @@ #include <asm/processor.h> #include <asm/sclp.h> +#define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) + static inline void kvm_s390_fpu_store(struct kvm_run *run) { fpu_stfpc(&run->s.regs.fpc); @@ -279,6 +281,15 @@ static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) return gd; } +static inline hva_t gpa_to_hva(struct kvm *kvm, gpa_t gpa) +{ + hva_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + + if (!kvm_is_error_hva(hva)) + hva |= offset_in_page(gpa); + return hva; +} + /* implemented in pv.c */ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); @@ -408,6 +419,14 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); +int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, + unsigned long bits); + +static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) +{ + return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); +} /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 75e81ba26d04..22c012aa5206 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -17,6 +17,7 @@ #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include "kvm-s390.h" +#include "gmap.h" bool kvm_s390_pv_is_protected(struct kvm *kvm) { @@ -638,10 +639,28 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[1] = offset, }; int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + unsigned long vmaddr; + bool unlocked; *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; + if (ret == -ENXIO) { + mmap_read_lock(kvm->mm); + vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(vmaddr)) { + ret = -EFAULT; + } else { + ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); + if (!ret) + ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); + } + mmap_read_unlock(kvm->mm); + if (!ret) + return -EAGAIN; + return ret; + } + if (ret && ret != -EAGAIN) KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", uvcb.gaddr, *rc, *rrc); @@ -660,6 +679,8 @@ int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", addr, size); + guard(srcu)(&kvm->srcu); + while (offset < size) { ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); if (ret == -EAGAIN) { diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a687695d8f68..a78df3a4f353 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -13,6 +13,7 @@ #include <linux/bitmap.h> #include <linux/sched/signal.h> #include <linux/io.h> +#include <linux/mman.h> #include <asm/gmap.h> #include <asm/mmu_context.h> @@ -22,6 +23,11 @@ #include <asm/facility.h> #include "kvm-s390.h" #include "gaccess.h" +#include "gmap.h" + +enum vsie_page_flags { + VSIE_PAGE_IN_USE = 0, +}; struct vsie_page { struct kvm_s390_sie_block scb_s; /* 0x0000 */ @@ -46,7 +52,18 @@ struct vsie_page { gpa_t gvrd_gpa; /* 0x0240 */ gpa_t riccbd_gpa; /* 0x0248 */ gpa_t sdnx_gpa; /* 0x0250 */ - __u8 reserved[0x0700 - 0x0258]; /* 0x0258 */ + /* + * guest address of the original SCB. Remains set for free vsie + * pages, so we can properly look them up in our addr_to_page + * radix tree. + */ + gpa_t scb_gpa; /* 0x0258 */ + /* + * Flags: must be set/cleared atomically after the vsie page can be + * looked up by other CPUs. + */ + unsigned long flags; /* 0x0260 */ + __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; @@ -584,7 +601,6 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, struct kvm *kvm = gmap->private; struct vsie_page *cur; unsigned long prefix; - struct page *page; int i; if (!gmap_is_shadow(gmap)) @@ -594,10 +610,9 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, * therefore we can safely reference them all the time. */ for (i = 0; i < kvm->arch.vsie.page_count; i++) { - page = READ_ONCE(kvm->arch.vsie.pages[i]); - if (!page) + cur = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!cur) continue; - cur = page_to_virt(page); if (READ_ONCE(cur->gmap) != gmap) continue; prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; @@ -1345,6 +1360,20 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return rc; } +/* Try getting a given vsie page, returning "true" on success. */ +static inline bool try_get_vsie_page(struct vsie_page *vsie_page) +{ + if (test_bit(VSIE_PAGE_IN_USE, &vsie_page->flags)) + return false; + return !test_and_set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); +} + +/* Put a vsie page acquired through get_vsie_page / try_get_vsie_page. */ +static void put_vsie_page(struct vsie_page *vsie_page) +{ + clear_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); +} + /* * Get or create a vsie page for a scb address. * @@ -1355,16 +1384,21 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) { struct vsie_page *vsie_page; - struct page *page; int nr_vcpus; rcu_read_lock(); - page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + vsie_page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); rcu_read_unlock(); - if (page) { - if (page_ref_inc_return(page) == 2) - return page_to_virt(page); - page_ref_dec(page); + if (vsie_page) { + if (try_get_vsie_page(vsie_page)) { + if (vsie_page->scb_gpa == addr) + return vsie_page; + /* + * We raced with someone reusing + putting this vsie + * page before we grabbed it. + */ + put_vsie_page(vsie_page); + } } /* @@ -1375,36 +1409,40 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_lock(&kvm->arch.vsie.mutex); if (kvm->arch.vsie.page_count < nr_vcpus) { - page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); - if (!page) { + vsie_page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); + if (!vsie_page) { mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); } - page_ref_inc(page); - kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = vsie_page; kvm->arch.vsie.page_count++; } else { /* reuse an existing entry that belongs to nobody */ while (true) { - page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; - if (page_ref_inc_return(page) == 2) + vsie_page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (try_get_vsie_page(vsie_page)) break; - page_ref_dec(page); kvm->arch.vsie.next++; kvm->arch.vsie.next %= nr_vcpus; } - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + if (vsie_page->scb_gpa != ULONG_MAX) + radix_tree_delete(&kvm->arch.vsie.addr_to_page, + vsie_page->scb_gpa >> 9); } - page->index = addr; - /* double use of the same address */ - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { - page_ref_dec(page); + /* Mark it as invalid until it resides in the tree. */ + vsie_page->scb_gpa = ULONG_MAX; + + /* Double use of the same address or allocation failure. */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, + vsie_page)) { + put_vsie_page(vsie_page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; } + vsie_page->scb_gpa = addr; mutex_unlock(&kvm->arch.vsie.mutex); - vsie_page = page_to_virt(page); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); release_gmap_shadow(vsie_page); vsie_page->fault_addr = 0; @@ -1412,14 +1450,6 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) return vsie_page; } -/* put a vsie page acquired via get_vsie_page */ -static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) -{ - struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); - - page_ref_dec(page); -} - int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) { struct vsie_page *vsie_page; @@ -1470,7 +1500,7 @@ out_unshadow: out_unpin_scb: unpin_scb(vcpu, vsie_page, scb_addr); out_put: - put_vsie_page(vcpu->kvm, vsie_page); + put_vsie_page(vsie_page); return rc < 0 ? rc : 0; } @@ -1486,18 +1516,18 @@ void kvm_s390_vsie_init(struct kvm *kvm) void kvm_s390_vsie_destroy(struct kvm *kvm) { struct vsie_page *vsie_page; - struct page *page; int i; mutex_lock(&kvm->arch.vsie.mutex); for (i = 0; i < kvm->arch.vsie.page_count; i++) { - page = kvm->arch.vsie.pages[i]; + vsie_page = kvm->arch.vsie.pages[i]; kvm->arch.vsie.pages[i] = NULL; - vsie_page = page_to_virt(page); release_gmap_shadow(vsie_page); /* free the radix tree entry */ - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); - __free_page(page); + if (vsie_page->scb_gpa != ULONG_MAX) + radix_tree_delete(&kvm->arch.vsie.addr_to_page, + vsie_page->scb_gpa >> 9); + free_page((unsigned long)vsie_page); } kvm->arch.vsie.page_count = 0; mutex_unlock(&kvm->arch.vsie.mutex); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 16b8a36c56de..94d927785800 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -24,6 +24,16 @@ #include <asm/page.h> #include <asm/tlb.h> +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. + */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) + #define GMAP_SHADOW_FAKE_TABLE 1ULL static struct page *gmap_alloc_crst(void) @@ -43,7 +53,7 @@ static struct page *gmap_alloc_crst(void) * * Returns a guest address space structure. */ -static struct gmap *gmap_alloc(unsigned long limit) +struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -70,9 +80,7 @@ static struct gmap *gmap_alloc(unsigned long limit) gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; - INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); - INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); @@ -82,8 +90,6 @@ static struct gmap *gmap_alloc(unsigned long limit) page = gmap_alloc_crst(); if (!page) goto out_free; - page->index = 0; - list_add(&page->lru, &gmap->crst_list); table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; @@ -97,6 +103,7 @@ out_free: out: return NULL; } +EXPORT_SYMBOL_GPL(gmap_alloc); /** * gmap_create - create a guest address space @@ -185,32 +192,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure * * No locks required. There are no references to this gmap anymore. */ -static void gmap_free(struct gmap *gmap) +void gmap_free(struct gmap *gmap) { - struct page *page, *next; - /* Flush tlb of all gmaps (if not already done for shadows) */ if (!(gmap_is_shadow(gmap) && gmap->removed)) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, CRST_ALLOC_ORDER); + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { - struct ptdesc *ptdesc, *n; - - /* Free all page tables. */ - list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list) - page_table_free_pgste(ptdesc); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -218,6 +239,7 @@ static void gmap_free(struct gmap *gmap) kfree(gmap); } +EXPORT_SYMBOL_GPL(gmap_free); /** * gmap_get - increase reference counter for guest address space @@ -298,10 +320,8 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; page = NULL; } spin_unlock(&gmap->guest_table_lock); @@ -310,21 +330,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return 0; } -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) { - struct page *page; - unsigned long offset; + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - page = pmd_pgtable_page((pmd_t *) entry); - return page->index + offset; +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; } /** @@ -336,16 +358,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry) */ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) { - unsigned long *entry; + unsigned long gaddr; int flush = 0; + pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != _SEGMENT_ENTRY_EMPTY); - *entry = _SEGMENT_ENTRY_EMPTY; + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } + spin_unlock(&gmap->guest_table_lock); return flush; } @@ -464,26 +489,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) EXPORT_SYMBOL_GPL(__gmap_translate); /** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - mmap_read_lock(gmap->mm); - rc = __gmap_translate(gmap, gaddr); - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - -/** * gmap_unlink - disconnect a page table from the gmap shadow tables * @mm: pointer to the parent mm_struct * @table: pointer to the host page table @@ -582,7 +587,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) spin_lock(&gmap->guest_table_lock); if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); if (!rc) { if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & @@ -605,130 +611,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) radix_tree_preload_end(); return rc; } - -/** - * fixup_user_fault_nowait - manually resolve a user page fault without waiting - * @mm: mm_struct of target mm - * @address: user address - * @fault_flags:flags to pass down to handle_mm_fault() - * @unlocked: did we unlock the mmap_lock while retrying - * - * This function behaves similarly to fixup_user_fault(), but it guarantees - * that the fault will be resolved without waiting. The function might drop - * and re-acquire the mm lock, in which case @unlocked will be set to true. - * - * The guarantee is that the fault is handled without waiting, but the - * function itself might sleep, due to the lock. - * - * Context: Needs to be called with mm->mmap_lock held in read mode, and will - * return with the lock held in read mode; @unlocked will indicate whether - * the lock has been dropped and re-acquired. This is the same behaviour as - * fixup_user_fault(). - * - * Return: 0 on success, -EAGAIN if the fault cannot be resolved without - * waiting, -EFAULT if the fault cannot be resolved, -ENOMEM if out of - * memory. - */ -static int fixup_user_fault_nowait(struct mm_struct *mm, unsigned long address, - unsigned int fault_flags, bool *unlocked) -{ - struct vm_area_struct *vma; - unsigned int test_flags; - vm_fault_t fault; - int rc; - - fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; - test_flags = fault_flags & FAULT_FLAG_WRITE ? VM_WRITE : VM_READ; - - vma = find_vma(mm, address); - if (unlikely(!vma || address < vma->vm_start)) - return -EFAULT; - if (unlikely(!(vma->vm_flags & test_flags))) - return -EFAULT; - - fault = handle_mm_fault(vma, address, fault_flags, NULL); - /* the mm lock has been dropped, take it again */ - if (fault & VM_FAULT_COMPLETED) { - *unlocked = true; - mmap_read_lock(mm); - return 0; - } - /* the mm lock has not been dropped */ - if (fault & VM_FAULT_ERROR) { - rc = vm_fault_to_errno(fault, 0); - BUG_ON(!rc); - return rc; - } - /* the mm lock has not been dropped because of FAULT_FLAG_RETRY_NOWAIT */ - if (fault & VM_FAULT_RETRY) - return -EAGAIN; - /* nothing needed to be done and the mm lock has not been dropped */ - return 0; -} - -/** - * __gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Context: Needs to be called with mm->mmap_lock held in read mode. Might - * drop and re-acquire the lock. Will always return with the lock held. - */ -static int __gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags) -{ - unsigned long vmaddr; - bool unlocked; - int rc = 0; - -retry: - unlocked = false; - - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - - if (fault_flags & FAULT_FLAG_RETRY_NOWAIT) - rc = fixup_user_fault_nowait(gmap->mm, vmaddr, fault_flags, &unlocked); - else - rc = fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked); - if (rc) - return rc; - /* - * In the case that fixup_user_fault unlocked the mmap_lock during - * fault-in, redo __gmap_translate() to avoid racing with a - * map/unmap_segment. - * In particular, __gmap_translate(), fixup_user_fault{,_nowait}(), - * and __gmap_link() must all be called atomically in one go; if the - * lock had been dropped in between, a retry is needed. - */ - if (unlocked) - goto retry; - - return __gmap_link(gmap, gaddr, vmaddr); -} - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, -EFAULT if the - * vm address is already mapped to a different guest segment, and -EAGAIN if - * FAULT_FLAG_RETRY_NOWAIT was specified and the fault could not be processed - * immediately. - */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags) -{ - int rc; - - mmap_read_lock(gmap->mm); - rc = __gmap_fault(gmap, gaddr, fault_flags); - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); +EXPORT_SYMBOL(__gmap_link); /* * this function is assumed to be called with mmap_lock held @@ -853,8 +736,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, * * Note: Can also be called for shadow gmaps. */ -static inline unsigned long *gmap_table_walk(struct gmap *gmap, - unsigned long gaddr, int level) +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { const int asce_type = gmap->asce & _ASCE_TYPE_MASK; unsigned long *table = gmap->table; @@ -905,6 +787,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, } return table; } +EXPORT_SYMBOL(gmap_table_walk); /** * gmap_pte_op_walk - walk the gmap page table, get the page table lock @@ -1101,86 +984,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * @bits: pgste notification bits to set * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. * - * Called with sg->mm->mmap_lock in read. + * Context: Called with sg->mm->mmap_lock in read. */ -static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot, unsigned long bits) +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) { - unsigned long vmaddr, dist; pmd_t *pmdp; - int rc; + int rc = 0; BUG_ON(gmap_is_shadow(gmap)); - while (len) { - rc = -EAGAIN; - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (pmdp) { - if (!pmd_leaf(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - len -= PAGE_SIZE; - gaddr += PAGE_SIZE; - } - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); - len = len < dist ? 0 : len - dist; - gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; - } - } - gmap_pmd_op_end(gmap, pmdp); - } - if (rc) { - if (rc == -EINVAL) - return rc; - /* -EAGAIN, fixup of userspace mm and gmap */ - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); - if (rc) - return rc; - } - } - return 0; -} + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; -/** - * gmap_mprotect_notify - change access rights for a range of ptes and - * call the notifier if any pte changes again - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if for each page in the given range a gmap mapping exists, - * the new access rights could be set and the notifier could be armed. - * If the gmap mapping is missing for one or more pages -EFAULT is - * returned. If no memory could be allocated -ENOMEM is returned. - * This function establishes missing page table entries. - */ -int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot) -{ - int rc; + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) - return -EINVAL; - if (!MACHINE_HAS_ESOP && prot == PROT_READ) - return -EINVAL; - mmap_read_lock(gmap->mm); - rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - mmap_read_unlock(gmap->mm); return rc; } -EXPORT_SYMBOL_GPL(gmap_mprotect_notify); +EXPORT_SYMBOL_GPL(gmap_protect_one); /** * gmap_read_table - get an unsigned long value from a guest page table using @@ -1414,7 +1251,6 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ ptdesc = page_ptdesc(phys_to_page(pgt)); - list_del(&ptdesc->pt_list); page_table_free_pgste(ptdesc); } @@ -1442,7 +1278,6 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ ptdesc = page_ptdesc(phys_to_page(pgt)); - list_del(&ptdesc->pt_list); page_table_free_pgste(ptdesc); } } @@ -1472,7 +1307,6 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ page = phys_to_page(sgt); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1500,7 +1334,6 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ page = phys_to_page(sgt); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1530,7 +1363,6 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ page = phys_to_page(r3t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1558,7 +1390,6 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ page = phys_to_page(r3t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1588,7 +1419,6 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ page = phys_to_page(r2t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1620,7 +1450,6 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ page = phys_to_page(r2t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1631,7 +1460,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, * * Called with sg->guest_table_lock */ -static void gmap_unshadow(struct gmap *sg) +void gmap_unshadow(struct gmap *sg) { unsigned long *table; @@ -1657,143 +1486,7 @@ static void gmap_unshadow(struct gmap *sg) break; } } - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg; - - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce != asce || sg->edat_level != edat_level || - sg->removed) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. - * - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} -EXPORT_SYMBOL_GPL(gmap_shadow_valid); - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. - */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - BUG_ON(parent->mm->context.allow_gmap_hpage_1m); - BUG_ON(gmap_is_shadow(parent)); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->private = parent->private; - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} -EXPORT_SYMBOL_GPL(gmap_shadow); +EXPORT_SYMBOL(gmap_unshadow); /** * gmap_shadow_r2t - create an empty shadow region 2 table @@ -1827,9 +1520,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r2t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -1851,7 +1541,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1911,9 +1600,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r3t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -1935,7 +1621,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1995,9 +1680,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = sgt & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -2019,7 +1701,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -2052,45 +1733,22 @@ out_free: } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); -/** - * gmap_shadow_pgt_lookup - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) { - unsigned long *table; - struct page *page; - int rc; + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - BUG_ON(!gmap_is_shadow(sg)); - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; - } - spin_unlock(&sg->guest_table_lock); - return rc; + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; } -EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); /** * gmap_shadow_pgt - instantiate a shadow page table @@ -2119,9 +1777,10 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, ptdesc = page_table_alloc_pgste(sg->mm); if (!ptdesc) return -ENOMEM; - ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE; + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); @@ -2140,7 +1799,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&ptdesc->pt_list, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2318,7 +1976,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte, unsigned long bits) { unsigned long offset, gaddr = 0; - unsigned long *table; struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); @@ -2326,12 +1983,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (table) - gaddr = __gmap_segment_gaddr(table) + offset; + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; spin_unlock(&gmap->guest_table_lock); - if (!table) + if (!IS_GADDR_VALID(gaddr)) continue; if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { @@ -2391,10 +2045,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); if (pmdp) { - gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | _SEGMENT_ENTRY_GMAP_UC | @@ -2438,28 +2090,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_csp); */ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); else if (MACHINE_HAS_IDTE) __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2474,22 +2123,19 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); */ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2497,7 +2143,7 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2943,49 +2589,6 @@ int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); /** - * s390_unlist_old_asce - Remove the topmost level of page tables from the - * list of page tables of the gmap. - * @gmap: the gmap whose table is to be removed - * - * On s390x, KVM keeps a list of all pages containing the page tables of the - * gmap (the CRST list). This list is used at tear down time to free all - * pages that are now not needed anymore. - * - * This function removes the topmost page of the tree (the one pointed to by - * the ASCE) from the CRST list. - * - * This means that it will not be freed when the VM is torn down, and needs - * to be handled separately by the caller, unless a leak is actually - * intended. Notice that this function will only remove the page from the - * list, the page will still be used as a top level page table (and ASCE). - */ -void s390_unlist_old_asce(struct gmap *gmap) -{ - struct page *old; - - old = virt_to_page(gmap->table); - spin_lock(&gmap->guest_table_lock); - list_del(&old->lru); - /* - * Sometimes the topmost page might need to be "removed" multiple - * times, for example if the VM is rebooted into secure mode several - * times concurrently, or if s390_replace_asce fails after calling - * s390_remove_old_asce and is attempted again later. In that case - * the old asce has been removed from the list, and therefore it - * will not be freed when the VM terminates, but the ASCE is still - * in use and still pointed to. - * A subsequent call to replace_asce will follow the pointer and try - * to remove the same page from the list again. - * Therefore it's necessary that the page of the ASCE has valid - * pointers, so list_del can work (and do nothing) without - * dereferencing stale or invalid pointers. - */ - INIT_LIST_HEAD(&old->lru); - spin_unlock(&gmap->guest_table_lock); -} -EXPORT_SYMBOL_GPL(s390_unlist_old_asce); - -/** * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy * @gmap: the gmap whose ASCE needs to be replaced * @@ -3004,8 +2607,6 @@ int s390_replace_asce(struct gmap *gmap) struct page *page; void *table; - s390_unlist_old_asce(gmap); - /* Replacing segment type ASCEs would cause serious issues */ if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) return -EINVAL; @@ -3013,19 +2614,9 @@ int s390_replace_asce(struct gmap *gmap) page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = 0; table = page_to_virt(page); memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); - /* - * The caller has to deal with the old ASCE, but here we make sure - * the new one is properly added to the CRST list, so that - * it will be freed when the VM is torn down. - */ - spin_lock(&gmap->guest_table_lock); - list_add(&page->lru, &gmap->crst_list); - spin_unlock(&gmap->guest_table_lock); - /* Set new table origin while preserving existing ASCE control bits */ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); WRITE_ONCE(gmap->asce, asce); @@ -3035,3 +2626,31 @@ int s390_replace_asce(struct gmap *gmap) return 0; } EXPORT_SYMBOL_GPL(s390_replace_asce); + +/** + * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split + * @mm: the mm containing the folio to work on + * @folio: the folio + * @split: whether to split a large folio + * + * Context: Must be called while holding an extra reference to the folio; + * the mm lock should not be held. + */ +int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split) +{ + int rc; + + lockdep_assert_not_held(&mm->mmap_lock); + folio_wait_writeback(folio); + lru_add_drain_all(); + if (split) { + folio_lock(folio); + rc = split_folio(folio); + folio_unlock(folio); + + if (rc != -EBUSY) + return rc; + } + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(kvm_s390_wiggle_split_folio); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index cd2fef79ad2c..30387a6e98ff 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -176,8 +176,6 @@ unsigned long *page_table_alloc(struct mm_struct *mm) } table = ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); - /* pt_list is used by gmap only */ - INIT_LIST_HEAD(&ptdesc->pt_list); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 87198d957e2f..be2c311f5118 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2599,7 +2599,8 @@ config MITIGATION_IBPB_ENTRY depends on CPU_SUP_AMD && X86_64 default y help - Compile the kernel with support for the retbleed=ibpb mitigation. + Compile the kernel with support for the retbleed=ibpb and + spec_rstack_overflow={ibpb,ibpb-vmexit} mitigations. config MITIGATION_IBRS_ENTRY bool "Enable IBRS on kernel entry" diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f2051644de94..606c74f27459 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,6 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ # avoid errors with '-march=i386', and future flags may depend on the target to # be valid. KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) +KBUILD_CFLAGS += -std=gnu11 KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5a505aa65489..a5d0998d7604 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1115,6 +1115,8 @@ do_cmd_auto: case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + mitigate_smt = true; /* * IBPB on entry already obviates the need for @@ -1124,9 +1126,6 @@ do_cmd_auto: setup_clear_cpu_cap(X86_FEATURE_UNRET); setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - mitigate_smt = true; - /* * There is no need for RSB filling: entry_ibpb() ensures * all predictions, including the RSB, are invalidated, @@ -2646,6 +2645,7 @@ static void __init srso_select_mitigation(void) if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB; /* @@ -2655,6 +2655,13 @@ static void __init srso_select_mitigation(void) */ setup_clear_cpu_cap(X86_FEATURE_UNRET); setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); @@ -2663,8 +2670,8 @@ static void __init srso_select_mitigation(void) ibpb_on_vmexit: case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { + if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; @@ -2676,8 +2683,8 @@ ibpb_on_vmexit: setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { - pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); - } + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + } break; default: break; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2cbb3874ad39..8eb3a88707f2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1180,7 +1180,7 @@ void kvm_set_cpu_caps(void) SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), - SYNTHESIZED_F(SRSO_USER_KERNEL_NO), + F(SRSO_USER_KERNEL_NO), ); kvm_cpu_cap_init(CPUID_8000_0022_EAX, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a45ae60e84ab..74c20dbb92da 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7120,6 +7120,19 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static void kvm_wake_nx_recovery_thread(struct kvm *kvm) +{ + /* + * The NX recovery thread is spawned on-demand at the first KVM_RUN and + * may not be valid even though the VM is globally visible. Do nothing, + * as such a VM can't have any possible NX huge pages. + */ + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); + + if (nx_thread) + vhost_task_wake(nx_thread); +} + static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) @@ -7180,7 +7193,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); } mutex_unlock(&kvm_lock); } @@ -7315,7 +7328,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); mutex_unlock(&kvm_lock); } @@ -7451,14 +7464,20 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); + struct vhost_task *nx_thread; kvm->arch.nx_huge_page_last = get_jiffies_64(); - kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( - kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, - kvm, "kvm-nx-lpage-recovery"); + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, + kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - if (kvm->arch.nx_huge_page_recovery_thread) - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); + if (!nx_thread) + return; + + vhost_task_start(nx_thread); + + /* Make the task visible only once it is fully started. */ + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); } int kvm_mmu_post_init_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d4a6734b2d6..8e77e61d4fbd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12741,6 +12741,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); } + once_init(&kvm->arch.nx_once); return 0; out_uninit_mmu: @@ -12750,12 +12751,6 @@ out: return ret; } -int kvm_arch_post_init_vm(struct kvm *kvm) -{ - once_init(&kvm->arch.nx_once); - return 0; -} - static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 9252652afe59..894edf8d6d62 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -100,9 +100,6 @@ SYM_FUNC_START(xen_hypercall_hvm) push %r10 push %r9 push %r8 -#ifdef CONFIG_FRAME_POINTER - pushq $0 /* Dummy push for stack alignment. */ -#endif #endif /* Set the vendor specific function. */ call __xen_hypercall_setfunc @@ -117,11 +114,8 @@ SYM_FUNC_START(xen_hypercall_hvm) pop %ebx pop %eax #else - lea xen_hypercall_amd(%rip), %rbx - cmp %rax, %rbx -#ifdef CONFIG_FRAME_POINTER - pop %rax /* Dummy pop. */ -#endif + lea xen_hypercall_amd(%rip), %rcx + cmp %rax, %rcx pop %r8 pop %r9 pop %r10 @@ -132,6 +126,7 @@ SYM_FUNC_START(xen_hypercall_hvm) pop %rcx pop %rax #endif + FRAME_END /* Use correct hypercall function. */ jz xen_hypercall_amd jmp xen_hypercall_intel diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c index 97d4a032171f..f5b8497cf5ad 100644 --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c @@ -21,6 +21,11 @@ #define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* milliseconds */ +MODULE_FIRMWARE("amdnpu/1502_00/npu.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_10/npu.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_11/npu.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin"); + /* * Bind the driver base on (vendor_id, device_id) pair and later use the * (device_id, rev_id) pair as a key to select the devices. The devices with diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 1e8ffbe25eee..38cf1c342c72 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -397,15 +397,19 @@ int ivpu_boot(struct ivpu_device *vdev) if (ivpu_fw_is_cold_boot(vdev)) { ret = ivpu_pm_dct_init(vdev); if (ret) - goto err_diagnose_failure; + goto err_disable_ipc; ret = ivpu_hw_sched_init(vdev); if (ret) - goto err_diagnose_failure; + goto err_disable_ipc; } return 0; +err_disable_ipc: + ivpu_ipc_disable(vdev); + ivpu_hw_irq_disable(vdev); + disable_irq(vdev->irq); err_diagnose_failure: ivpu_hw_diagnose_failure(vdev); ivpu_mmu_evtq_dump(vdev); diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 87d7411ae059..5060c5dd40d1 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -115,41 +115,57 @@ err_power_down: return ret; } -static void ivpu_pm_recovery_work(struct work_struct *work) +static void ivpu_pm_reset_begin(struct ivpu_device *vdev) { - struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); - struct ivpu_device *vdev = pm->vdev; - char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; - int ret; - - ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); - - ret = pm_runtime_resume_and_get(vdev->drm.dev); - if (ret) - ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); - - ivpu_jsm_state_dump(vdev); - ivpu_dev_coredump(vdev); + pm_runtime_disable(vdev->drm.dev); atomic_inc(&vdev->pm->reset_counter); atomic_set(&vdev->pm->reset_pending, 1); down_write(&vdev->pm->reset_lock); +} + +static void ivpu_pm_reset_complete(struct ivpu_device *vdev) +{ + int ret; - ivpu_suspend(vdev); ivpu_pm_prepare_cold_boot(vdev); ivpu_jobs_abort_all(vdev); ivpu_ms_cleanup_all(vdev); ret = ivpu_resume(vdev); - if (ret) + if (ret) { ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); + pm_runtime_set_suspended(vdev->drm.dev); + } else { + pm_runtime_set_active(vdev->drm.dev); + } up_write(&vdev->pm->reset_lock); atomic_set(&vdev->pm->reset_pending, 0); - kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); pm_runtime_mark_last_busy(vdev->drm.dev); - pm_runtime_put_autosuspend(vdev->drm.dev); + pm_runtime_enable(vdev->drm.dev); +} + +static void ivpu_pm_recovery_work(struct work_struct *work) +{ + struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); + struct ivpu_device *vdev = pm->vdev; + char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; + + ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); + + ivpu_pm_reset_begin(vdev); + + if (!pm_runtime_status_suspended(vdev->drm.dev)) { + ivpu_jsm_state_dump(vdev); + ivpu_dev_coredump(vdev); + ivpu_suspend(vdev); + } + + ivpu_pm_reset_complete(vdev); + + kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); } void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) @@ -309,7 +325,10 @@ int ivpu_rpm_get(struct ivpu_device *vdev) int ret; ret = pm_runtime_resume_and_get(vdev->drm.dev); - drm_WARN_ON(&vdev->drm, ret < 0); + if (ret < 0) { + ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); + pm_runtime_set_suspended(vdev->drm.dev); + } return ret; } @@ -325,16 +344,13 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev) struct ivpu_device *vdev = pci_get_drvdata(pdev); ivpu_dbg(vdev, PM, "Pre-reset..\n"); - atomic_inc(&vdev->pm->reset_counter); - atomic_set(&vdev->pm->reset_pending, 1); - pm_runtime_get_sync(vdev->drm.dev); - down_write(&vdev->pm->reset_lock); - ivpu_prepare_for_reset(vdev); - ivpu_hw_reset(vdev); - ivpu_pm_prepare_cold_boot(vdev); - ivpu_jobs_abort_all(vdev); - ivpu_ms_cleanup_all(vdev); + ivpu_pm_reset_begin(vdev); + + if (!pm_runtime_status_suspended(vdev->drm.dev)) { + ivpu_prepare_for_reset(vdev); + ivpu_hw_reset(vdev); + } ivpu_dbg(vdev, PM, "Pre-reset done.\n"); } @@ -342,18 +358,12 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev) void ivpu_pm_reset_done_cb(struct pci_dev *pdev) { struct ivpu_device *vdev = pci_get_drvdata(pdev); - int ret; ivpu_dbg(vdev, PM, "Post-reset..\n"); - ret = ivpu_resume(vdev); - if (ret) - ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret); - up_write(&vdev->pm->reset_lock); - atomic_set(&vdev->pm->reset_pending, 0); - ivpu_dbg(vdev, PM, "Post-reset done.\n"); - pm_runtime_mark_last_busy(vdev->drm.dev); - pm_runtime_put_autosuspend(vdev->drm.dev); + ivpu_pm_reset_complete(vdev); + + ivpu_dbg(vdev, PM, "Post-reset done.\n"); } void ivpu_pm_init(struct ivpu_device *vdev) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 747f83f7114d..e549914a636c 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -287,9 +287,7 @@ static acpi_status acpi_platformrt_space_handler(u32 function, if (!handler || !module) goto invalid_guid; - if (!handler->handler_addr || - !handler->static_data_buffer_addr || - !handler->acpi_param_buffer_addr) { + if (!handler->handler_addr) { buffer->prm_status = PRM_HANDLER_ERROR; return AE_OK; } diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 98d93ed58315..436019d96027 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1187,8 +1187,6 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, } break; } - if (nval == 0) - return -EINVAL; if (obj->type == ACPI_TYPE_BUFFER) { if (proptype != DEV_PROP_U8) @@ -1212,9 +1210,11 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, ret = acpi_copy_property_array_uint(items, (u64 *)val, nval); break; case DEV_PROP_STRING: - ret = acpi_copy_property_array_string( - items, (char **)val, - min_t(u32, nval, obj->package.count)); + nval = min_t(u32, nval, obj->package.count); + if (nval == 0) + return -ENODATA; + + ret = acpi_copy_property_array_string(items, (char **)val, nval); break; default: ret = -EINVAL; diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 90aaec923889..b4cd14e7fa76 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -564,6 +564,12 @@ static const struct dmi_system_id irq1_edge_low_force_override[] = { }, }, { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Eluktronics Inc."), + DMI_MATCH(DMI_BOARD_NAME, "MECH-17"), + }, + }, + { /* TongFang GM6XGxX/TUXEDO Stellaris 16 Gen5 AMD */ .matches = { DMI_MATCH(DMI_BOARD_NAME, "GM6XGxX"), diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d497d448e4b2..40e1d8d8a589 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1191,24 +1191,18 @@ static pm_message_t resume_event(pm_message_t sleep_state) return PMSG_ON; } -static void dpm_superior_set_must_resume(struct device *dev, bool set_active) +static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; - if (dev->parent) { + if (dev->parent) dev->parent->power.must_resume = true; - if (set_active) - dev->parent->power.set_active = true; - } idx = device_links_read_lock(); - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { + list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; - if (set_active) - link->supplier->power.set_active = true; - } device_links_read_unlock(idx); } @@ -1287,9 +1281,12 @@ Skip: dev->power.must_resume = true; if (dev->power.must_resume) { - dev->power.set_active = dev->power.set_active || - dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND); - dpm_superior_set_must_resume(dev, dev->power.set_active); + if (dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) { + dev->power.set_active = true; + if (dev->parent && !dev->parent->power.ignore_children) + dev->parent->power.set_active = true; + } + dpm_superior_set_must_resume(dev); } Complete: diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index 0bcd81389a29..978613407ea3 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -906,6 +906,7 @@ err_alloc: kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); + kfree(d->main_status_buf); kfree(d->status_buf); kfree(d->status_reg_buf); if (d->config_buf) { @@ -981,6 +982,7 @@ void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d) kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); + kfree(d->main_status_buf); kfree(d->status_reg_buf); kfree(d->status_buf); if (d->config_buf) { diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 33b3bc99d532..282f81616a78 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -1127,8 +1127,8 @@ static void vdc_queue_drain(struct vdc_port *port) spin_lock_irq(&port->vio.lock); port->drain = 0; - blk_mq_unquiesce_queue(q, memflags); - blk_mq_unfreeze_queue(q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q, memflags); } static void vdc_ldc_reset_timer_work(struct work_struct *work) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 2b79952f3628..091ffe3e1495 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -1320,6 +1320,10 @@ static int btintel_pcie_send_frame(struct hci_dev *hdev, if (opcode == 0xfc01) btintel_pcie_inject_cmd_complete(hdev, opcode); } + /* Firmware raises alive interrupt on HCI_OP_RESET */ + if (opcode == HCI_OP_RESET) + data->gp0_received = false; + hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: @@ -1357,7 +1361,6 @@ static int btintel_pcie_send_frame(struct hci_dev *hdev, opcode, btintel_pcie_alivectxt_state2str(old_ctxt), btintel_pcie_alivectxt_state2str(data->alive_intr_ctxt)); if (opcode == HCI_OP_RESET) { - data->gp0_received = false; ret = wait_event_timeout(data->gp0_wait_q, data->gp0_received, msecs_to_jiffies(BTINTEL_DEFAULT_INTR_TIMEOUT_MS)); diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c index 6276551d7968..1e57ebfb7622 100644 --- a/drivers/bus/moxtet.c +++ b/drivers/bus/moxtet.c @@ -657,7 +657,7 @@ static void moxtet_irq_print_chip(struct irq_data *d, struct seq_file *p) id = moxtet->modules[pos->idx]; - seq_printf(p, " moxtet-%s.%i#%i", mox_module_name(id), pos->idx, + seq_printf(p, "moxtet-%s.%i#%i", mox_module_name(id), pos->idx, pos->bit); } diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 0ee5c691fb36..9e46960f6a86 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -17,7 +17,8 @@ config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM config ARM_AIROHA_SOC_CPUFREQ tristate "Airoha EN7581 SoC CPUFreq support" - depends on (ARCH_AIROHA && OF) || COMPILE_TEST + depends on ARCH_AIROHA || COMPILE_TEST + depends on OF select PM_OPP default ARCH_AIROHA help diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index dd9b8d6993d6..313550fa62d4 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -699,7 +699,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (min_perf < lowest_nonlinear_perf) min_perf = lowest_nonlinear_perf; - max_perf = cap_perf; + max_perf = cpudata->max_limit_perf; if (max_perf < min_perf) max_perf = min_perf; @@ -747,7 +747,6 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) guard(mutex)(&amd_pstate_driver_lock); ret = amd_pstate_cpu_boost_update(policy, state); - policy->boost_enabled = !ret ? state : false; refresh_frequency_limits(policy); return ret; @@ -822,25 +821,28 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) static void amd_pstate_update_limits(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cpufreq_policy *policy = NULL; struct amd_cpudata *cpudata; u32 prev_high = 0, cur_high = 0; int ret; bool highest_perf_changed = false; + if (!amd_pstate_prefcore) + return; + + policy = cpufreq_cpu_get(cpu); if (!policy) return; cpudata = policy->driver_data; - if (!amd_pstate_prefcore) - return; - guard(mutex)(&amd_pstate_driver_lock); ret = amd_get_highest_perf(cpu, &cur_high); - if (ret) - goto free_cpufreq_put; + if (ret) { + cpufreq_cpu_put(policy); + return; + } prev_high = READ_ONCE(cpudata->prefcore_ranking); highest_perf_changed = (prev_high != cur_high); @@ -850,8 +852,6 @@ static void amd_pstate_update_limits(unsigned int cpu) if (cur_high < CPPC_MAX_PERF) sched_set_itmt_core_prio((int)cur_high, cpu); } - -free_cpufreq_put: cpufreq_cpu_put(policy); if (!highest_perf_changed) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e0048856ecee..30ffbddc7ece 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1571,7 +1571,8 @@ static int cpufreq_online(unsigned int cpu) policy->cdev = of_cpufreq_cooling_register(policy); /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - if (policy->boost_enabled != cpufreq_boost_enabled()) { + if (cpufreq_driver->set_boost && + policy->boost_enabled != cpufreq_boost_enabled()) { policy->boost_enabled = cpufreq_boost_enabled(); ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); if (ret) { diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 71d8b26c4103..9f35f69e0f9e 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -106,7 +106,7 @@ config ISCSI_IBFT select ISCSI_BOOT_SYSFS select ISCSI_IBFT_FIND if X86 depends on ACPI && SCSI && SCSI_LOWLEVEL - default n + default n help This option enables support for detection and exposing of iSCSI Boot Firmware Table (iBFT) via sysfs to userspace. If you wish to diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c index 6e9788324fea..371f24569b3b 100644 --- a/drivers/firmware/iscsi_ibft.c +++ b/drivers/firmware/iscsi_ibft.c @@ -310,7 +310,10 @@ static ssize_t ibft_attr_show_nic(void *data, int type, char *buf) str += sprintf_ipaddr(str, nic->ip_addr); break; case ISCSI_BOOT_ETH_SUBNET_MASK: - val = cpu_to_be32(~((1 << (32-nic->subnet_mask_prefix))-1)); + if (nic->subnet_mask_prefix > 32) + val = cpu_to_be32(~0); + else + val = cpu_to_be32(~((1 << (32-nic->subnet_mask_prefix))-1)); str += sprintf(str, "%pI4", &val); break; case ISCSI_BOOT_ETH_PREFIX_LEN: diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index add5ad29a673..98b4d1633b25 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -338,6 +338,7 @@ config GPIO_GRANITERAPIDS config GPIO_GRGPIO tristate "Aeroflex Gaisler GRGPIO support" + depends on OF || COMPILE_TEST select GPIO_GENERIC select IRQ_DOMAIN help diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index be4c9981ebc4..d63c1030e6ac 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -841,25 +841,6 @@ static bool pca953x_irq_pending(struct pca953x_chip *chip, unsigned long *pendin DECLARE_BITMAP(trigger, MAX_LINE); int ret; - if (chip->driver_data & PCA_PCAL) { - /* Read the current interrupt status from the device */ - ret = pca953x_read_regs(chip, PCAL953X_INT_STAT, trigger); - if (ret) - return false; - - /* Check latched inputs and clear interrupt status */ - ret = pca953x_read_regs(chip, chip->regs->input, cur_stat); - if (ret) - return false; - - /* Apply filter for rising/falling edge selection */ - bitmap_replace(new_stat, chip->irq_trig_fall, chip->irq_trig_raise, cur_stat, gc->ngpio); - - bitmap_and(pending, new_stat, trigger, gc->ngpio); - - return !bitmap_empty(pending, gc->ngpio); - } - ret = pca953x_read_regs(chip, chip->regs->input, cur_stat); if (ret) return false; diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index a086087ada17..b6c230fab840 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -1028,20 +1028,23 @@ gpio_sim_device_lockup_configfs(struct gpio_sim_device *dev, bool lock) struct configfs_subsystem *subsys = dev->group.cg_subsys; struct gpio_sim_bank *bank; struct gpio_sim_line *line; + struct config_item *item; /* - * The device only needs to depend on leaf line entries. This is + * The device only needs to depend on leaf entries. This is * sufficient to lock up all the configfs entries that the * instantiated, alive device depends on. */ list_for_each_entry(bank, &dev->bank_list, siblings) { list_for_each_entry(line, &bank->line_list, siblings) { + item = line->hog ? &line->hog->item + : &line->group.cg_item; + if (lock) - WARN_ON(configfs_depend_item_unlocked( - subsys, &line->group.cg_item)); + WARN_ON(configfs_depend_item_unlocked(subsys, + item)); else - configfs_undepend_item_unlocked( - &line->group.cg_item); + configfs_undepend_item_unlocked(item); } } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 817116e53d44..dce9323fb410 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -119,9 +119,10 @@ * - 3.57.0 - Compute tunneling on GFX10+ * - 3.58.0 - Add GFX12 DCC support * - 3.59.0 - Cleared VRAM + * - 3.60.0 - Add AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE (Vulkan requirement) */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 59 +#define KMS_DRIVER_MINOR 60 #define KMS_DRIVER_PATCHLEVEL 0 /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ff286940ab43..01ae2f88dec8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -309,7 +309,7 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev, mutex_lock(&adev->mman.gtt_window_lock); while (src_mm.remaining) { uint64_t from, to, cur_size, tiling_flags; - uint32_t num_type, data_format, max_com; + uint32_t num_type, data_format, max_com, write_compress_disable; struct dma_fence *next; /* Never copy more than 256MiB at once to avoid a timeout */ @@ -340,9 +340,13 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev, max_com = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_MAX_COMPRESSED_BLOCK); num_type = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_NUMBER_TYPE); data_format = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_DATA_FORMAT); + write_compress_disable = + AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_WRITE_COMPRESS_DISABLE); copy_flags |= (AMDGPU_COPY_FLAGS_SET(MAX_COMPRESSED, max_com) | AMDGPU_COPY_FLAGS_SET(NUMBER_TYPE, num_type) | - AMDGPU_COPY_FLAGS_SET(DATA_FORMAT, data_format)); + AMDGPU_COPY_FLAGS_SET(DATA_FORMAT, data_format) | + AMDGPU_COPY_FLAGS_SET(WRITE_COMPRESS_DISABLE, + write_compress_disable)); } r = amdgpu_copy_buffer(ring, from, to, cur_size, resv, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 461fb8090ae0..208b7d1d8a27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -119,6 +119,8 @@ struct amdgpu_copy_mem { #define AMDGPU_COPY_FLAGS_NUMBER_TYPE_MASK 0x07 #define AMDGPU_COPY_FLAGS_DATA_FORMAT_SHIFT 8 #define AMDGPU_COPY_FLAGS_DATA_FORMAT_MASK 0x3f +#define AMDGPU_COPY_FLAGS_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_COPY_FLAGS_WRITE_COMPRESS_DISABLE_MASK 0x1 #define AMDGPU_COPY_FLAGS_SET(field, value) \ (((__u32)(value) & AMDGPU_COPY_FLAGS_##field##_MASK) << AMDGPU_COPY_FLAGS_##field##_SHIFT) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 9c17df2cf37b..7e10e94624e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1741,11 +1741,12 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib *ib, uint32_t byte_count, uint32_t copy_flags) { - uint32_t num_type, data_format, max_com; + uint32_t num_type, data_format, max_com, write_cm; max_com = AMDGPU_COPY_FLAGS_GET(copy_flags, MAX_COMPRESSED); data_format = AMDGPU_COPY_FLAGS_GET(copy_flags, DATA_FORMAT); num_type = AMDGPU_COPY_FLAGS_GET(copy_flags, NUMBER_TYPE); + write_cm = AMDGPU_COPY_FLAGS_GET(copy_flags, WRITE_COMPRESS_DISABLE) ? 2 : 1; ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) | SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | @@ -1762,7 +1763,7 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib *ib, if ((copy_flags & (AMDGPU_COPY_FLAGS_READ_DECOMPRESSED | AMDGPU_COPY_FLAGS_WRITE_COMPRESSED))) ib->ptr[ib->length_dw++] = SDMA_DCC_DATA_FORMAT(data_format) | SDMA_DCC_NUM_TYPE(num_type) | ((copy_flags & AMDGPU_COPY_FLAGS_READ_DECOMPRESSED) ? SDMA_DCC_READ_CM(2) : 0) | - ((copy_flags & AMDGPU_COPY_FLAGS_WRITE_COMPRESSED) ? SDMA_DCC_WRITE_CM(1) : 0) | + ((copy_flags & AMDGPU_COPY_FLAGS_WRITE_COMPRESSED) ? SDMA_DCC_WRITE_CM(write_cm) : 0) | SDMA_DCC_MAX_COM(max_com) | SDMA_DCC_MAX_UCOM(1); else ib->ptr[ib->length_dw++] = 0; diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index cecaadf741ad..f84e795e35f5 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2133,7 +2133,7 @@ static enum dc_status dc_commit_state_no_check(struct dc *dc, struct dc_state *c dc_enable_stereo(dc, context, dc_streams, context->stream_count); - if (context->stream_count > get_seamless_boot_stream_count(context) || + if (get_seamless_boot_stream_count(context) == 0 || context->stream_count == 0) { /* Must wait for no flips to be pending before doing optimize bw */ hwss_wait_for_no_pipes_pending(dc, context); diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c index 5bb8b78bf250..bf636b28e3e1 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c @@ -63,8 +63,7 @@ void dmub_hw_lock_mgr_inbox0_cmd(struct dc_dmub_srv *dmub_srv, bool should_use_dmub_lock(struct dc_link *link) { - if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1 || - link->psr_settings.psr_version == DC_PSR_VERSION_1) + if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) return true; if (link->replay_settings.replay_feature_enabled) diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 46f9c05de16e..e1d500633dfa 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -29,11 +29,15 @@ dml_ccflags := $(CC_FLAGS_FPU) dml_rcflags := $(CC_FLAGS_NO_FPU) ifneq ($(CONFIG_FRAME_WARN),0) -ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) -frame_warn_flag := -Wframe-larger-than=3072 -else -frame_warn_flag := -Wframe-larger-than=2048 -endif + ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) + frame_warn_limit := 3072 + else + frame_warn_limit := 2048 + endif + + ifeq ($(call test-lt, $(CONFIG_FRAME_WARN), $(frame_warn_limit)),y) + frame_warn_flag := -Wframe-larger-than=$(frame_warn_limit) + endif endif CFLAGS_$(AMDDALPATH)/dc/dml/display_mode_lib.o := $(dml_ccflags) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/Makefile b/drivers/gpu/drm/amd/display/dc/dml2/Makefile index 91c4f3b4bd5f..21fd466dba26 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml2/Makefile @@ -28,15 +28,19 @@ dml2_ccflags := $(CC_FLAGS_FPU) dml2_rcflags := $(CC_FLAGS_NO_FPU) ifneq ($(CONFIG_FRAME_WARN),0) -ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) -ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_COMPILE_TEST),yy) -frame_warn_flag := -Wframe-larger-than=4096 -else -frame_warn_flag := -Wframe-larger-than=3072 -endif -else -frame_warn_flag := -Wframe-larger-than=2048 -endif + ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) + ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_COMPILE_TEST),yy) + frame_warn_limit := 4096 + else + frame_warn_limit := 3072 + endif + else + frame_warn_limit := 2048 + endif + + ifeq ($(call test-lt, $(CONFIG_FRAME_WARN), $(frame_warn_limit)),y) + frame_warn_flag := -Wframe-larger-than=$(frame_warn_limit) + endif endif subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/dc/dml2 diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c index b9c6b45f6872..0c8ec30ea672 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c @@ -1017,7 +1017,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s if (disp_cfg_stream_location < 0) disp_cfg_stream_location = dml_dispcfg->num_streams++; - ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location <= __DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml21_timing_config_from_stream_state(&dml_dispcfg->stream_descriptors[disp_cfg_stream_location].timing, context->streams[stream_index], dml_ctx); adjust_dml21_hblank_timing_config_from_pipe_ctx(&dml_dispcfg->stream_descriptors[disp_cfg_stream_location].timing, &context->res_ctx.pipe_ctx[stream_index]); populate_dml21_output_config_from_stream_state(&dml_dispcfg->stream_descriptors[disp_cfg_stream_location].output, context->streams[stream_index], &context->res_ctx.pipe_ctx[stream_index]); @@ -1042,7 +1042,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s if (disp_cfg_plane_location < 0) disp_cfg_plane_location = dml_dispcfg->num_planes++; - ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location <= __DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml21_surface_config_from_plane_state(in_dc, &dml_dispcfg->plane_descriptors[disp_cfg_plane_location].surface, context->stream_status[stream_index].plane_states[plane_index]); populate_dml21_plane_config_from_plane_state(dml_ctx, &dml_dispcfg->plane_descriptors[disp_cfg_plane_location], context->stream_status[stream_index].plane_states[plane_index], context, stream_index); diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c index b416320873e1..b8a34abaf519 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c @@ -786,7 +786,7 @@ static void populate_dml_output_cfg_from_stream_state(struct dml_output_cfg_st * case SIGNAL_TYPE_DISPLAY_PORT_MST: case SIGNAL_TYPE_DISPLAY_PORT: out->OutputEncoder[location] = dml_dp; - if (dml2->v20.scratch.hpo_stream_to_link_encoder_mapping[location] != -1) + if (location < MAX_HPO_DP2_ENCODERS && dml2->v20.scratch.hpo_stream_to_link_encoder_mapping[location] != -1) out->OutputEncoder[dml2->v20.scratch.hpo_stream_to_link_encoder_mapping[location]] = dml_dp2p0; break; case SIGNAL_TYPE_EDP: @@ -1343,7 +1343,7 @@ void map_dc_state_into_dml_display_cfg(struct dml2_context *dml2, struct dc_stat if (disp_cfg_stream_location < 0) disp_cfg_stream_location = dml_dispcfg->num_timings++; - ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location <= __DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml_timing_cfg_from_stream_state(&dml_dispcfg->timing, disp_cfg_stream_location, context->streams[i]); populate_dml_output_cfg_from_stream_state(&dml_dispcfg->output, disp_cfg_stream_location, context->streams[i], current_pipe_context, dml2); @@ -1383,7 +1383,7 @@ void map_dc_state_into_dml_display_cfg(struct dml2_context *dml2, struct dc_stat if (disp_cfg_plane_location < 0) disp_cfg_plane_location = dml_dispcfg->num_surfaces++; - ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location <= __DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml_surface_cfg_from_plane_state(dml2->v20.dml_core_ctx.project, &dml_dispcfg->surface, disp_cfg_plane_location, context->stream_status[i].plane_states[j]); populate_dml_plane_cfg_from_plane_state( diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c index fe741100c0f8..d347bb06577a 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c @@ -129,7 +129,8 @@ bool hubbub3_program_watermarks( REG_UPDATE(DCHUBBUB_ARB_DF_REQ_OUTSTAND, DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 0x1FF); - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower || hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); return wm_pending; } diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c index 7fb5523f9722..b98505b240a7 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c @@ -750,7 +750,8 @@ static bool hubbub31_program_watermarks( REG_UPDATE(DCHUBBUB_ARB_DF_REQ_OUTSTAND, DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 0x1FF);*/ - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower || hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); return wm_pending; } diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c index 5264dc26cce1..32a6be543105 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c @@ -786,7 +786,8 @@ static bool hubbub32_program_watermarks( REG_UPDATE(DCHUBBUB_ARB_DF_REQ_OUTSTAND, DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 0x1FF);*/ - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower || hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); hubbub32_force_usr_retraining_allow(hubbub, hubbub->ctx->dc->debug.force_usr_allow); diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c index 5eb3da8d5206..dce7269959ce 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c @@ -326,7 +326,8 @@ static bool hubbub35_program_watermarks( DCHUBBUB_ARB_MIN_REQ_OUTSTAND_COMMIT_THRESHOLD, 0xA);/*hw delta*/ REG_UPDATE(DCHUBBUB_ARB_HOSTVM_CNTL, DCHUBBUB_ARB_MAX_QOS_COMMIT_THRESHOLD, 0xF); - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower || hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); hubbub32_force_usr_retraining_allow(hubbub, hubbub->ctx->dc->debug.force_usr_allow); diff --git a/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c b/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c index be0ac613675a..0da70b50e86d 100644 --- a/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c @@ -500,6 +500,8 @@ void hubp3_init(struct hubp *hubp) //hubp[i].HUBPREQ_DEBUG.HUBPREQ_DEBUG[26] = 1; REG_WRITE(HUBPREQ_DEBUG, 1 << 26); + REG_UPDATE(DCHUBP_CNTL, HUBP_TTU_DISABLE, 0); + hubp_reset(hubp); } diff --git a/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c b/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c index edd37898d550..f3a21c623f44 100644 --- a/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c @@ -168,6 +168,8 @@ void hubp32_init(struct hubp *hubp) { struct dcn20_hubp *hubp2 = TO_DCN20_HUBP(hubp); REG_WRITE(HUBPREQ_DEBUG_DB, 1 << 8); + + REG_UPDATE(DCHUBP_CNTL, HUBP_TTU_DISABLE, 0); } static struct hubp_funcs dcn32_hubp_funcs = { .hubp_enable_tripleBuffer = hubp2_enable_triplebuffer, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index 623cde76debf..b907ad1acedd 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -236,7 +236,8 @@ void dcn35_init_hw(struct dc *dc) } hws->funcs.init_pipes(dc, dc->current_state); - if (dc->res_pool->hubbub->funcs->allow_self_refresh_control) + if (dc->res_pool->hubbub->funcs->allow_self_refresh_control && + !dc->res_pool->hubbub->ctx->dc->debug.disable_stutter) dc->res_pool->hubbub->funcs->allow_self_refresh_control(dc->res_pool->hubbub, !dc->res_pool->hubbub->ctx->dc->debug.disable_stutter); } diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c index ebccb74306a7..f30b3d5eeca5 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c @@ -160,6 +160,10 @@ static int komeda_wb_connector_add(struct komeda_kms_dev *kms, formats = komeda_get_layer_fourcc_list(&mdev->fmt_tbl, kwb_conn->wb_layer->layer_type, &n_formats); + if (!formats) { + kfree(kwb_conn); + return -ENOMEM; + } err = drm_writeback_connector_init(&kms->base, wb_conn, &komeda_wb_connector_funcs, diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c index 0e282b7b167c..b9eb67e3fa90 100644 --- a/drivers/gpu/drm/ast/ast_dp.c +++ b/drivers/gpu/drm/ast/ast_dp.c @@ -195,7 +195,7 @@ static bool __ast_dp_wait_enable(struct ast_device *ast, bool enabled) if (enabled) vgacrdf_test |= AST_IO_VGACRDF_DP_VIDEO_ENABLE; - for (i = 0; i < 200; ++i) { + for (i = 0; i < 1000; ++i) { if (i) mdelay(1); vgacrdf = ast_get_index_reg_mask(ast, AST_IO_VGACRI, 0xdf, diff --git a/drivers/gpu/drm/display/drm_dp_cec.c b/drivers/gpu/drm/display/drm_dp_cec.c index 007ceb281d00..56a4965e518c 100644 --- a/drivers/gpu/drm/display/drm_dp_cec.c +++ b/drivers/gpu/drm/display/drm_dp_cec.c @@ -311,16 +311,6 @@ void drm_dp_cec_attach(struct drm_dp_aux *aux, u16 source_physical_address) if (!aux->transfer) return; -#ifndef CONFIG_MEDIA_CEC_RC - /* - * CEC_CAP_RC is part of CEC_CAP_DEFAULTS, but it is stripped by - * cec_allocate_adapter() if CONFIG_MEDIA_CEC_RC is undefined. - * - * Do this here as well to ensure the tests against cec_caps are - * correct. - */ - cec_caps &= ~CEC_CAP_RC; -#endif cancel_delayed_work_sync(&aux->cec.unregister_work); mutex_lock(&aux->cec.lock); @@ -337,7 +327,9 @@ void drm_dp_cec_attach(struct drm_dp_aux *aux, u16 source_physical_address) num_las = CEC_MAX_LOG_ADDRS; if (aux->cec.adap) { - if (aux->cec.adap->capabilities == cec_caps && + /* Check if the adapter properties have changed */ + if ((aux->cec.adap->capabilities & CEC_CAP_MONITOR_ALL) == + (cec_caps & CEC_CAP_MONITOR_ALL) && aux->cec.adap->available_log_addrs == num_las) { /* Unchanged, so just set the phys addr */ cec_s_phys_addr(aux->cec.adap, source_physical_address, false); diff --git a/drivers/gpu/drm/i915/display/intel_backlight.c b/drivers/gpu/drm/i915/display/intel_backlight.c index fc1e517e074a..7e6ce905bdaf 100644 --- a/drivers/gpu/drm/i915/display/intel_backlight.c +++ b/drivers/gpu/drm/i915/display/intel_backlight.c @@ -41,8 +41,9 @@ static u32 scale(u32 source_val, { u64 target_val; - WARN_ON(source_min > source_max); - WARN_ON(target_min > target_max); + if (WARN_ON(source_min >= source_max) || + WARN_ON(target_min > target_max)) + return target_min; /* defensive */ source_val = clamp(source_val, source_min, source_max); diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index f1f3b1bb1e89..aa77ddcee42c 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1791,7 +1791,7 @@ int intel_dp_dsc_max_src_input_bpc(struct intel_display *display) if (DISPLAY_VER(display) == 11) return 10; - return 0; + return intel_dp_dsc_min_src_input_bpc(); } int intel_dp_dsc_compute_max_bpp(const struct intel_connector *connector, @@ -2072,11 +2072,10 @@ icl_dsc_compute_link_config(struct intel_dp *intel_dp, /* Compressed BPP should be less than the Input DSC bpp */ dsc_max_bpp = min(dsc_max_bpp, pipe_bpp - 1); - for (i = 0; i < ARRAY_SIZE(valid_dsc_bpp); i++) { - if (valid_dsc_bpp[i] < dsc_min_bpp) + for (i = ARRAY_SIZE(valid_dsc_bpp) - 1; i >= 0; i--) { + if (valid_dsc_bpp[i] < dsc_min_bpp || + valid_dsc_bpp[i] > dsc_max_bpp) continue; - if (valid_dsc_bpp[i] > dsc_max_bpp) - break; ret = dsc_compute_link_config(intel_dp, pipe_config, @@ -2829,7 +2828,6 @@ static void intel_dp_compute_as_sdp(struct intel_dp *intel_dp, crtc_state->infoframes.enable |= intel_hdmi_infoframe_enable(DP_SDP_ADAPTIVE_SYNC); - /* Currently only DP_AS_SDP_AVT_FIXED_VTOTAL mode supported */ as_sdp->sdp_type = DP_SDP_ADAPTIVE_SYNC; as_sdp->length = 0x9; as_sdp->duration_incr_ms = 0; @@ -2840,7 +2838,7 @@ static void intel_dp_compute_as_sdp(struct intel_dp *intel_dp, as_sdp->target_rr = drm_mode_vrefresh(adjusted_mode); as_sdp->target_rr_divider = true; } else { - as_sdp->mode = DP_AS_SDP_AVT_FIXED_VTOTAL; + as_sdp->mode = DP_AS_SDP_AVT_DYNAMIC_VTOTAL; as_sdp->vtotal = adjusted_mode->vtotal; as_sdp->target_rr = 0; } diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 0c44fc7dd86c..a65cf97ad12d 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -341,6 +341,10 @@ int intel_dp_mtp_tu_compute_config(struct intel_dp *intel_dp, break; } + + /* Allow using zero step to indicate one try */ + if (!step) + break; } if (slots < 0) { diff --git a/drivers/gpu/drm/i915/display/intel_hdcp.c b/drivers/gpu/drm/i915/display/intel_hdcp.c index 7464b44c8bb3..1bab7c34a794 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_hdcp.c @@ -41,7 +41,7 @@ intel_hdcp_adjust_hdcp_line_rekeying(struct intel_encoder *encoder, u32 rekey_bit = 0; /* Here we assume HDMI is in TMDS mode of operation */ - if (encoder->type != INTEL_OUTPUT_HDMI) + if (!intel_encoder_is_hdmi(encoder)) return; if (DISPLAY_VER(display) >= 30) { @@ -2188,6 +2188,19 @@ static int intel_hdcp2_check_link(struct intel_connector *connector) drm_dbg_kms(display->drm, "HDCP2.2 Downstream topology change\n"); + + ret = hdcp2_authenticate_repeater_topology(connector); + if (!ret) { + intel_hdcp_update_value(connector, + DRM_MODE_CONTENT_PROTECTION_ENABLED, + true); + goto out; + } + + drm_dbg_kms(display->drm, + "[CONNECTOR:%d:%s] Repeater topology auth failed.(%d)\n", + connector->base.base.id, connector->base.name, + ret); } else { drm_dbg_kms(display->drm, "[CONNECTOR:%d:%s] HDCP2.2 link failed, retrying auth\n", diff --git a/drivers/gpu/drm/i915/display/skl_universal_plane.c b/drivers/gpu/drm/i915/display/skl_universal_plane.c index ff9764cac1e7..80e558042d97 100644 --- a/drivers/gpu/drm/i915/display/skl_universal_plane.c +++ b/drivers/gpu/drm/i915/display/skl_universal_plane.c @@ -106,8 +106,6 @@ static const u32 icl_sdr_y_plane_formats[] = { DRM_FORMAT_Y216, DRM_FORMAT_XYUV8888, DRM_FORMAT_XVYU2101010, - DRM_FORMAT_XVYU12_16161616, - DRM_FORMAT_XVYU16161616, }; static const u32 icl_sdr_uv_plane_formats[] = { @@ -134,8 +132,6 @@ static const u32 icl_sdr_uv_plane_formats[] = { DRM_FORMAT_Y216, DRM_FORMAT_XYUV8888, DRM_FORMAT_XVYU2101010, - DRM_FORMAT_XVYU12_16161616, - DRM_FORMAT_XVYU16161616, }; static const u32 icl_hdr_plane_formats[] = { diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index fe69f2c8527d..ae3343c81a64 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -209,8 +209,6 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj) struct address_space *mapping = obj->base.filp->f_mapping; unsigned int max_segment = i915_sg_segment_size(i915->drm.dev); struct sg_table *st; - struct sgt_iter sgt_iter; - struct page *page; int ret; /* @@ -239,9 +237,7 @@ rebuild_st: * for PAGE_SIZE chunks instead may be helpful. */ if (max_segment > PAGE_SIZE) { - for_each_sgt_page(page, sgt_iter, st) - put_page(page); - sg_free_table(st); + shmem_sg_free_table(st, mapping, false, false); kfree(st); max_segment = PAGE_SIZE; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 12f1ba7ca9c1..cc05bd9e43b4 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1469,6 +1469,19 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) spin_unlock_irqrestore(&guc->timestamp.lock, flags); } +static void __update_guc_busyness_running_state(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned long flags; + + spin_lock_irqsave(&guc->timestamp.lock, flags); + for_each_engine(engine, gt, id) + engine->stats.guc.running = false; + spin_unlock_irqrestore(&guc->timestamp.lock, flags); +} + static void __update_guc_busyness_stats(struct intel_guc *guc) { struct intel_gt *gt = guc_to_gt(guc); @@ -1619,6 +1632,9 @@ void intel_guc_busyness_park(struct intel_gt *gt) if (!guc_submission_initialized(guc)) return; + /* Assume no engines are running and set running state to false */ + __update_guc_busyness_running_state(guc); + /* * There is a race with suspend flow where the worker runs after suspend * and causes an unclaimed register access warning. Cancel the worker @@ -5519,12 +5535,20 @@ static inline void guc_log_context(struct drm_printer *p, { drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id.id); drm_printf(p, "\tHW Context Desc: 0x%08x\n", ce->lrc.lrca); - drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", - ce->ring->head, - ce->lrc_reg_state[CTX_RING_HEAD]); - drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", - ce->ring->tail, - ce->lrc_reg_state[CTX_RING_TAIL]); + if (intel_context_pin_if_active(ce)) { + drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", + ce->ring->head, + ce->lrc_reg_state[CTX_RING_HEAD]); + drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", + ce->ring->tail, + ce->lrc_reg_state[CTX_RING_TAIL]); + intel_context_unpin(ce); + } else { + drm_printf(p, "\t\tLRC Head: Internal %u, Memory not pinned\n", + ce->ring->head); + drm_printf(p, "\t\tLRC Tail: Internal %u, Memory not pinned\n", + ce->ring->tail); + } drm_printf(p, "\t\tContext Pin Count: %u\n", atomic_read(&ce->pin_count)); drm_printf(p, "\t\tGuC ID Ref Count: %u\n", diff --git a/drivers/gpu/drm/xe/regs/xe_oa_regs.h b/drivers/gpu/drm/xe/regs/xe_oa_regs.h index a49561e9f3c3..a79ad2da070c 100644 --- a/drivers/gpu/drm/xe/regs/xe_oa_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_oa_regs.h @@ -51,6 +51,10 @@ /* Common to all OA units */ #define OA_OACONTROL_REPORT_BC_MASK REG_GENMASK(9, 9) #define OA_OACONTROL_COUNTER_SIZE_MASK REG_GENMASK(8, 8) +#define OAG_OACONTROL_USED_BITS \ + (OAG_OACONTROL_OA_PES_DISAG_EN | OAG_OACONTROL_OA_CCS_SELECT_MASK | \ + OAG_OACONTROL_OA_COUNTER_SEL_MASK | OAG_OACONTROL_OA_COUNTER_ENABLE | \ + OA_OACONTROL_REPORT_BC_MASK | OA_OACONTROL_COUNTER_SIZE_MASK) #define OAG_OA_DEBUG XE_REG(0xdaf8, XE_REG_OPTION_MASKED) #define OAG_OA_DEBUG_DISABLE_MMIO_TRG REG_BIT(14) @@ -78,6 +82,8 @@ #define OAM_CONTEXT_CONTROL_OFFSET (0x1bc) #define OAM_CONTROL_OFFSET (0x194) #define OAM_CONTROL_COUNTER_SEL_MASK REG_GENMASK(3, 1) +#define OAM_OACONTROL_USED_BITS \ + (OAM_CONTROL_COUNTER_SEL_MASK | OAG_OACONTROL_OA_COUNTER_ENABLE) #define OAM_DEBUG_OFFSET (0x198) #define OAM_STATUS_OFFSET (0x19c) #define OAM_MMIO_TRG_OFFSET (0x1d0) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index 81dc7795c065..39fe485d2085 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -119,11 +119,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_puts(&p, "\n**** GuC CT ****\n"); xe_guc_ct_snapshot_print(ss->guc.ct, &p); - /* - * Don't add a new section header here because the mesa debug decoder - * tool expects the context information to be in the 'GuC CT' section. - */ - /* drm_puts(&p, "\n**** Contexts ****\n"); */ + drm_puts(&p, "\n**** Contexts ****\n"); xe_guc_exec_queue_snapshot_print(ss->ge, &p); drm_puts(&p, "\n**** Job ****\n"); @@ -395,42 +391,34 @@ int xe_devcoredump_init(struct xe_device *xe) /** * xe_print_blob_ascii85 - print a BLOB to some useful location in ASCII85 * - * The output is split to multiple lines because some print targets, e.g. dmesg - * cannot handle arbitrarily long lines. Note also that printing to dmesg in - * piece-meal fashion is not possible, each separate call to drm_puts() has a - * line-feed automatically added! Therefore, the entire output line must be - * constructed in a local buffer first, then printed in one atomic output call. + * The output is split into multiple calls to drm_puts() because some print + * targets, e.g. dmesg, cannot handle arbitrarily long lines. These targets may + * add newlines, as is the case with dmesg: each drm_puts() call creates a + * separate line. * * There is also a scheduler yield call to prevent the 'task has been stuck for * 120s' kernel hang check feature from firing when printing to a slow target * such as dmesg over a serial port. * - * TODO: Add compression prior to the ASCII85 encoding to shrink huge buffers down. - * * @p: the printer object to output to * @prefix: optional prefix to add to output string + * @suffix: optional suffix to add at the end. 0 disables it and is + * not added to the output, which is useful when using multiple calls + * to dump data to @p * @blob: the Binary Large OBject to dump out * @offset: offset in bytes to skip from the front of the BLOB, must be a multiple of sizeof(u32) * @size: the size in bytes of the BLOB, must be a multiple of sizeof(u32) */ -void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, +void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix, const void *blob, size_t offset, size_t size) { const u32 *blob32 = (const u32 *)blob; char buff[ASCII85_BUFSZ], *line_buff; size_t line_pos = 0; - /* - * Splitting blobs across multiple lines is not compatible with the mesa - * debug decoder tool. Note that even dropping the explicit '\n' below - * doesn't help because the GuC log is so big some underlying implementation - * still splits the lines at 512K characters. So just bail completely for - * the moment. - */ - return; - #define DMESG_MAX_LINE_LEN 800 -#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */ + /* Always leave space for the suffix char and the \0 */ +#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "<suffix>\0" */ if (size & 3) drm_printf(p, "Size not word aligned: %zu", size); @@ -462,7 +450,6 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, line_pos += strlen(line_buff + line_pos); if ((line_pos + MIN_SPACE) >= DMESG_MAX_LINE_LEN) { - line_buff[line_pos++] = '\n'; line_buff[line_pos++] = 0; drm_puts(p, line_buff); @@ -474,10 +461,11 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, } } + if (suffix) + line_buff[line_pos++] = suffix; + if (line_pos) { - line_buff[line_pos++] = '\n'; line_buff[line_pos++] = 0; - drm_puts(p, line_buff); } diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h index 6a17e6d60102..5391a80a4d1b 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.h +++ b/drivers/gpu/drm/xe/xe_devcoredump.h @@ -29,7 +29,7 @@ static inline int xe_devcoredump_init(struct xe_device *xe) } #endif -void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, +void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix, const void *blob, size_t offset, size_t size); #endif diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 26e64530ada2..5d6fb79957b6 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -532,8 +532,10 @@ static int all_fw_domain_init(struct xe_gt *gt) if (IS_SRIOV_PF(gt_to_xe(gt)) && !xe_gt_is_media_type(gt)) xe_lmtt_init_hw(>_to_tile(gt)->sriov.pf.lmtt); - if (IS_SRIOV_PF(gt_to_xe(gt))) + if (IS_SRIOV_PF(gt_to_xe(gt))) { + xe_gt_sriov_pf_init(gt); xe_gt_sriov_pf_init_hw(gt); + } xe_force_wake_put(gt_to_fw(gt), fw_ref); diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c index e71fc3d2bda2..6f906c8e8108 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c @@ -68,6 +68,19 @@ int xe_gt_sriov_pf_init_early(struct xe_gt *gt) return 0; } +/** + * xe_gt_sriov_pf_init - Prepare SR-IOV PF data structures on PF. + * @gt: the &xe_gt to initialize + * + * Late one-time initialization of the PF data. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_gt_sriov_pf_init(struct xe_gt *gt) +{ + return xe_gt_sriov_pf_migration_init(gt); +} + static bool pf_needs_enable_ggtt_guest_update(struct xe_device *xe) { return GRAPHICS_VERx100(xe) == 1200; @@ -90,7 +103,6 @@ void xe_gt_sriov_pf_init_hw(struct xe_gt *gt) pf_enable_ggtt_guest_update(gt); xe_gt_sriov_pf_service_update(gt); - xe_gt_sriov_pf_migration_init(gt); } static u32 pf_get_vf_regs_stride(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h index 96fab779a906..f474509411c0 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h @@ -10,6 +10,7 @@ struct xe_gt; #ifdef CONFIG_PCI_IOV int xe_gt_sriov_pf_init_early(struct xe_gt *gt); +int xe_gt_sriov_pf_init(struct xe_gt *gt); void xe_gt_sriov_pf_init_hw(struct xe_gt *gt); void xe_gt_sriov_pf_sanitize_hw(struct xe_gt *gt, unsigned int vfid); void xe_gt_sriov_pf_restart(struct xe_gt *gt); @@ -19,6 +20,11 @@ static inline int xe_gt_sriov_pf_init_early(struct xe_gt *gt) return 0; } +static inline int xe_gt_sriov_pf_init(struct xe_gt *gt) +{ + return 0; +} + static inline void xe_gt_sriov_pf_init_hw(struct xe_gt *gt) { } diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 8b65c5e959cc..50c8076b5158 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -1724,7 +1724,8 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, snapshot->g2h_outstanding); if (snapshot->ctb) - xe_print_blob_ascii85(p, "CTB data", snapshot->ctb, 0, snapshot->ctb_size); + xe_print_blob_ascii85(p, "CTB data", '\n', + snapshot->ctb, 0, snapshot->ctb_size); } else { drm_puts(p, "CT disabled\n"); } diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c index df4cfb698cdb..2baa4d95571f 100644 --- a/drivers/gpu/drm/xe/xe_guc_log.c +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -211,8 +211,10 @@ void xe_guc_log_snapshot_print(struct xe_guc_log_snapshot *snapshot, struct drm_ remain = snapshot->size; for (i = 0; i < snapshot->num_chunks; i++) { size_t size = min(GUC_LOG_CHUNK_SIZE, remain); + const char *prefix = i ? NULL : "Log data"; + char suffix = i == snapshot->num_chunks - 1 ? '\n' : 0; - xe_print_blob_ascii85(p, i ? NULL : "Log data", snapshot->copy[i], 0, size); + xe_print_blob_ascii85(p, prefix, suffix, snapshot->copy[i], 0, size); remain -= size; } } diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index eeb96b5f49e2..fa873f3d0a9d 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -237,7 +237,6 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) u32 tail, hw_tail, partial_report_size, available; int report_size = stream->oa_buffer.format->size; unsigned long flags; - bool pollin; spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); @@ -282,11 +281,11 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) stream->oa_buffer.tail = tail; available = xe_oa_circ_diff(stream, stream->oa_buffer.tail, stream->oa_buffer.head); - pollin = available >= stream->wait_num_reports * report_size; + stream->pollin = available >= stream->wait_num_reports * report_size; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); - return pollin; + return stream->pollin; } static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) @@ -294,10 +293,8 @@ static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) struct xe_oa_stream *stream = container_of(hrtimer, typeof(*stream), poll_check_timer); - if (xe_oa_buffer_check_unlocked(stream)) { - stream->pollin = true; + if (xe_oa_buffer_check_unlocked(stream)) wake_up(&stream->poll_wq); - } hrtimer_forward_now(hrtimer, ns_to_ktime(stream->poll_period_ns)); @@ -452,6 +449,12 @@ static u32 __oa_ccs_select(struct xe_oa_stream *stream) return val; } +static u32 __oactrl_used_bits(struct xe_oa_stream *stream) +{ + return stream->hwe->oa_unit->type == DRM_XE_OA_UNIT_TYPE_OAG ? + OAG_OACONTROL_USED_BITS : OAM_OACONTROL_USED_BITS; +} + static void xe_oa_enable(struct xe_oa_stream *stream) { const struct xe_oa_format *format = stream->oa_buffer.format; @@ -472,14 +475,14 @@ static void xe_oa_enable(struct xe_oa_stream *stream) stream->hwe->oa_unit->type == DRM_XE_OA_UNIT_TYPE_OAG) val |= OAG_OACONTROL_OA_PES_DISAG_EN; - xe_mmio_write32(&stream->gt->mmio, regs->oa_ctrl, val); + xe_mmio_rmw32(&stream->gt->mmio, regs->oa_ctrl, __oactrl_used_bits(stream), val); } static void xe_oa_disable(struct xe_oa_stream *stream) { struct xe_mmio *mmio = &stream->gt->mmio; - xe_mmio_write32(mmio, __oa_regs(stream)->oa_ctrl, 0); + xe_mmio_rmw32(mmio, __oa_regs(stream)->oa_ctrl, __oactrl_used_bits(stream), 0); if (xe_mmio_wait32(mmio, __oa_regs(stream)->oa_ctrl, OAG_OACONTROL_OA_COUNTER_ENABLE, 0, 50000, NULL, false)) drm_err(&stream->oa->xe->drm, @@ -2534,6 +2537,8 @@ static void __xe_oa_init_oa_units(struct xe_gt *gt) u->type = DRM_XE_OA_UNIT_TYPE_OAM; } + xe_mmio_write32(>->mmio, u->regs.oa_ctrl, 0); + /* Ensure MMIO trigger remains disabled till there is a stream */ xe_mmio_write32(>->mmio, u->regs.oa_debug, oag_configure_mmio_trigger(NULL, false)); diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index b53eb569bd49..dfc245867a46 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -570,6 +570,8 @@ config HID_LED config HID_LENOVO tristate "Lenovo / Thinkpad devices" + depends on ACPI + select ACPI_PLATFORM_PROFILE select NEW_LEDS select LEDS_CLASS help @@ -1167,7 +1169,8 @@ config HID_TOPRE tristate "Topre REALFORCE keyboards" depends on HID help - Say Y for N-key rollover support on Topre REALFORCE R2 108/87 key keyboards. + Say Y for N-key rollover support on Topre REALFORCE R2 108/87 key and + Topre REALFORCE R3S 87 key keyboards. config HID_THINGM tristate "ThingM blink(1) USB RGB LED" @@ -1374,10 +1377,6 @@ endmenu source "drivers/hid/bpf/Kconfig" -endif # HID - -source "drivers/hid/usbhid/Kconfig" - source "drivers/hid/i2c-hid/Kconfig" source "drivers/hid/intel-ish-hid/Kconfig" @@ -1388,4 +1387,10 @@ source "drivers/hid/surface-hid/Kconfig" source "drivers/hid/intel-thc-hid/Kconfig" +endif # HID + +# USB support may be used with HID disabled + +source "drivers/hid/usbhid/Kconfig" + endif # HID_SUPPORT diff --git a/drivers/hid/amd-sfh-hid/Kconfig b/drivers/hid/amd-sfh-hid/Kconfig index 329de5e12c1a..3291786a5ee6 100644 --- a/drivers/hid/amd-sfh-hid/Kconfig +++ b/drivers/hid/amd-sfh-hid/Kconfig @@ -5,7 +5,6 @@ menu "AMD SFH HID Support" config AMD_SFH_HID tristate "AMD Sensor Fusion Hub" - depends on HID depends on X86 help If you say yes to this option, support will be included for the diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 7e1ae2a2bcc2..49812a76b7ed 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -474,6 +474,7 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2015) table = magic_keyboard_2015_fn_keys; else if (hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021 || + hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024 || hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 || hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021) table = apple2021_fn_keys; @@ -545,6 +546,9 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, } } + if (usage->hid == 0xc0301) /* Omoton KB066 quirk */ + code = KEY_F6; + if (usage->code != code) { input_event_with_scancode(input, usage->type, code, usage->hid, value); @@ -1150,6 +1154,10 @@ static const struct hid_device_id apple_devices[] = { .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024), + .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, + { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024), + .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021), diff --git a/drivers/hid/hid-corsair-void.c b/drivers/hid/hid-corsair-void.c index 6ece56b850fc..56e858066c3c 100644 --- a/drivers/hid/hid-corsair-void.c +++ b/drivers/hid/hid-corsair-void.c @@ -553,7 +553,7 @@ static void corsair_void_battery_remove_work_handler(struct work_struct *work) static void corsair_void_battery_add_work_handler(struct work_struct *work) { struct corsair_void_drvdata *drvdata; - struct power_supply_config psy_cfg; + struct power_supply_config psy_cfg = {}; struct power_supply *new_supply; drvdata = container_of(work, struct corsair_void_drvdata, @@ -726,6 +726,7 @@ static void corsair_void_remove(struct hid_device *hid_dev) if (drvdata->battery) power_supply_unregister(drvdata->battery); + cancel_delayed_work_sync(&drvdata->delayed_status_work); cancel_delayed_work_sync(&drvdata->delayed_firmware_work); sysfs_remove_group(&hid_dev->dev.kobj, &corsair_void_attr_group); } diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index c448de53bf91..7e400624908e 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -184,6 +184,7 @@ #define USB_DEVICE_ID_APPLE_IRCONTROL4 0x8242 #define USB_DEVICE_ID_APPLE_IRCONTROL5 0x8243 #define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021 0x029c +#define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024 0x0320 #define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 0x029a #define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021 0x029f #define USB_DEVICE_ID_APPLE_TOUCHBAR_BACKLIGHT 0x8102 @@ -1095,6 +1096,7 @@ #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3001 0x3001 #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3003 0x3003 #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3008 0x3008 +#define USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473 0x5473 #define I2C_VENDOR_ID_RAYDIUM 0x2386 #define I2C_PRODUCT_ID_RAYDIUM_4B33 0x4b33 @@ -1301,6 +1303,7 @@ #define USB_VENDOR_ID_TOPRE 0x0853 #define USB_DEVICE_ID_TOPRE_REALFORCE_R2_108 0x0148 #define USB_DEVICE_ID_TOPRE_REALFORCE_R2_87 0x0146 +#define USB_DEVICE_ID_TOPRE_REALFORCE_R3S_87 0x0313 #define USB_VENDOR_ID_TOPSEED 0x0766 #define USB_DEVICE_ID_TOPSEED_CYBERLINK 0x0204 diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index 4d00bc4d656e..a7d9ca02779e 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -32,9 +32,7 @@ #include <linux/leds.h> #include <linux/workqueue.h> -#if IS_ENABLED(CONFIG_ACPI_PLATFORM_PROFILE) #include <linux/platform_profile.h> -#endif /* CONFIG_ACPI_PLATFORM_PROFILE */ #include "hid-ids.h" @@ -730,13 +728,10 @@ static int lenovo_raw_event_TP_X12_tab(struct hid_device *hdev, u32 raw_data) if (hdev->product == USB_DEVICE_ID_LENOVO_X12_TAB) { report_key_event(input, KEY_RFKILL); return 1; - } -#if IS_ENABLED(CONFIG_ACPI_PLATFORM_PROFILE) - else { + } else { platform_profile_cycle(); return 1; } -#endif /* CONFIG_ACPI_PLATFORM_PROFILE */ return 0; case TP_X12_RAW_HOTKEY_FN_F10: /* TAB1 has PICKUP Phone and TAB2 use Snipping tool*/ diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 82900857bfd8..e50887a6d22c 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1679,9 +1679,12 @@ static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi) break; } - if (suffix) + if (suffix) { hi->input->name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s", hdev->name, suffix); + if (!hi->input->name) + return -ENOMEM; + } return 0; } diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index e0bbf0c6345d..5d7a418ccdbe 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -891,6 +891,7 @@ static const struct hid_device_id hid_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_DPAD) }, #endif { HID_USB_DEVICE(USB_VENDOR_ID_YEALINK, USB_DEVICE_ID_YEALINK_P1K_P4K_B2K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473) }, { } }; diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index af38fc8eb34f..c9e65e9088b3 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -313,6 +313,7 @@ struct steam_device { u16 rumble_left; u16 rumble_right; unsigned int sensor_timestamp_us; + struct work_struct unregister_work; }; static int steam_recv_report(struct steam_device *steam, @@ -1050,10 +1051,10 @@ static void steam_mode_switch_cb(struct work_struct *work) struct steam_device, mode_switch); unsigned long flags; bool client_opened; - steam->gamepad_mode = !steam->gamepad_mode; if (!lizard_mode) return; + steam->gamepad_mode = !steam->gamepad_mode; if (steam->gamepad_mode) steam_set_lizard_mode(steam, false); else { @@ -1072,6 +1073,31 @@ static void steam_mode_switch_cb(struct work_struct *work) } } +static void steam_work_unregister_cb(struct work_struct *work) +{ + struct steam_device *steam = container_of(work, struct steam_device, + unregister_work); + unsigned long flags; + bool connected; + bool opened; + + spin_lock_irqsave(&steam->lock, flags); + opened = steam->client_opened; + connected = steam->connected; + spin_unlock_irqrestore(&steam->lock, flags); + + if (connected) { + if (opened) { + steam_sensors_unregister(steam); + steam_input_unregister(steam); + } else { + steam_set_lizard_mode(steam, lizard_mode); + steam_input_register(steam); + steam_sensors_register(steam); + } + } +} + static bool steam_is_valve_interface(struct hid_device *hdev) { struct hid_report_enum *rep_enum; @@ -1117,8 +1143,7 @@ static int steam_client_ll_open(struct hid_device *hdev) steam->client_opened++; spin_unlock_irqrestore(&steam->lock, flags); - steam_sensors_unregister(steam); - steam_input_unregister(steam); + schedule_work(&steam->unregister_work); return 0; } @@ -1135,11 +1160,7 @@ static void steam_client_ll_close(struct hid_device *hdev) connected = steam->connected && !steam->client_opened; spin_unlock_irqrestore(&steam->lock, flags); - if (connected) { - steam_set_lizard_mode(steam, lizard_mode); - steam_input_register(steam); - steam_sensors_register(steam); - } + schedule_work(&steam->unregister_work); } static int steam_client_ll_raw_request(struct hid_device *hdev, @@ -1231,6 +1252,7 @@ static int steam_probe(struct hid_device *hdev, INIT_LIST_HEAD(&steam->list); INIT_WORK(&steam->rumble_work, steam_haptic_rumble_cb); steam->sensor_timestamp_us = 0; + INIT_WORK(&steam->unregister_work, steam_work_unregister_cb); /* * With the real steam controller interface, do not connect hidraw. @@ -1291,6 +1313,7 @@ err_cancel_work: cancel_work_sync(&steam->work_connect); cancel_delayed_work_sync(&steam->mode_switch); cancel_work_sync(&steam->rumble_work); + cancel_work_sync(&steam->unregister_work); return ret; } @@ -1307,6 +1330,7 @@ static void steam_remove(struct hid_device *hdev) cancel_delayed_work_sync(&steam->mode_switch); cancel_work_sync(&steam->work_connect); cancel_work_sync(&steam->rumble_work); + cancel_work_sync(&steam->unregister_work); hid_destroy_device(steam->client_hdev); steam->client_hdev = NULL; steam->client_opened = 0; @@ -1593,13 +1617,13 @@ static void steam_do_deck_input_event(struct steam_device *steam, if (!(b9 & BIT(6)) && steam->did_mode_switch) { steam->did_mode_switch = false; - cancel_delayed_work_sync(&steam->mode_switch); + cancel_delayed_work(&steam->mode_switch); } else if (!steam->client_opened && (b9 & BIT(6)) && !steam->did_mode_switch) { steam->did_mode_switch = true; schedule_delayed_work(&steam->mode_switch, 45 * HZ / 100); } - if (!steam->gamepad_mode) + if (!steam->gamepad_mode && lizard_mode) return; lpad_touched = b10 & BIT(3); @@ -1669,7 +1693,7 @@ static void steam_do_deck_sensors_event(struct steam_device *steam, */ steam->sensor_timestamp_us += 4000; - if (!steam->gamepad_mode) + if (!steam->gamepad_mode && lizard_mode) return; input_event(sensors, EV_MSC, MSC_TIMESTAMP, steam->sensor_timestamp_us); diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index 6c3e758bbb09..3b81468a1df2 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -171,7 +171,7 @@ static void thrustmaster_interrupts(struct hid_device *hdev) b_ep = ep->desc.bEndpointAddress; /* Are the expected endpoints present? */ - u8 ep_addr[1] = {b_ep}; + u8 ep_addr[2] = {b_ep, 0}; if (!usb_check_int_endpoints(usbif, ep_addr)) { hid_err(hdev, "Unexpected non-int endpoint\n"); diff --git a/drivers/hid/hid-topre.c b/drivers/hid/hid-topre.c index 848361f6225d..ccedf8721722 100644 --- a/drivers/hid/hid-topre.c +++ b/drivers/hid/hid-topre.c @@ -29,6 +29,11 @@ static const __u8 *topre_report_fixup(struct hid_device *hdev, __u8 *rdesc, hid_info(hdev, "fixing up Topre REALFORCE keyboard report descriptor\n"); rdesc[72] = 0x02; + } else if (*rsize >= 106 && rdesc[28] == 0x29 && rdesc[29] == 0xe7 && + rdesc[30] == 0x81 && rdesc[31] == 0x00) { + hid_info(hdev, + "fixing up Topre REALFORCE keyboard report descriptor\n"); + rdesc[31] = 0x02; } return rdesc; } @@ -38,6 +43,8 @@ static const struct hid_device_id topre_id_table[] = { USB_DEVICE_ID_TOPRE_REALFORCE_R2_108) }, { HID_USB_DEVICE(USB_VENDOR_ID_TOPRE, USB_DEVICE_ID_TOPRE_REALFORCE_R2_87) }, + { HID_USB_DEVICE(USB_VENDOR_ID_TOPRE, + USB_DEVICE_ID_TOPRE_REALFORCE_R3S_87) }, { } }; MODULE_DEVICE_TABLE(hid, topre_id_table); diff --git a/drivers/hid/hid-winwing.c b/drivers/hid/hid-winwing.c index 831b760c66ea..d4afbbd27807 100644 --- a/drivers/hid/hid-winwing.c +++ b/drivers/hid/hid-winwing.c @@ -106,6 +106,8 @@ static int winwing_init_led(struct hid_device *hdev, "%s::%s", dev_name(&input->dev), info->led_name); + if (!led->cdev.name) + return -ENOMEM; ret = devm_led_classdev_register(&hdev->dev, &led->cdev); if (ret) diff --git a/drivers/hid/i2c-hid/Kconfig b/drivers/hid/i2c-hid/Kconfig index ef7c595c9403..e8d51f410cc1 100644 --- a/drivers/hid/i2c-hid/Kconfig +++ b/drivers/hid/i2c-hid/Kconfig @@ -2,7 +2,7 @@ menuconfig I2C_HID tristate "I2C HID support" default y - depends on I2C && INPUT && HID + depends on I2C if I2C_HID diff --git a/drivers/hid/intel-ish-hid/Kconfig b/drivers/hid/intel-ish-hid/Kconfig index 253dc10d35ef..568c8688784e 100644 --- a/drivers/hid/intel-ish-hid/Kconfig +++ b/drivers/hid/intel-ish-hid/Kconfig @@ -6,7 +6,6 @@ config INTEL_ISH_HID tristate "Intel Integrated Sensor Hub" default n depends on X86 - depends on HID help The Integrated Sensor Hub (ISH) enables the ability to offload sensor polling and algorithm processing to a dedicated low power diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index cdd80c653918..07e90d51f073 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -36,6 +36,8 @@ #define PCI_DEVICE_ID_INTEL_ISH_ARL_H 0x7745 #define PCI_DEVICE_ID_INTEL_ISH_ARL_S 0x7F78 #define PCI_DEVICE_ID_INTEL_ISH_LNL_M 0xA845 +#define PCI_DEVICE_ID_INTEL_ISH_PTL_H 0xE345 +#define PCI_DEVICE_ID_INTEL_ISH_PTL_P 0xE445 #define REVISION_ID_CHT_A0 0x6 #define REVISION_ID_CHT_Ax_SI 0x0 diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index 3cd53fc80634..4c861119e97a 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -517,6 +517,10 @@ static int ish_fw_reset_handler(struct ishtp_device *dev) /* ISH FW is dead */ if (!ish_is_input_ready(dev)) return -EPIPE; + + /* Send clock sync at once after reset */ + ishtp_dev->prev_sync = 0; + /* * Set HOST2ISH.ILUP. Apparently we need this BEFORE sending * RESET_NOTIFY_ACK - FW will be checking for it @@ -577,15 +581,14 @@ static void fw_reset_work_fn(struct work_struct *work) */ static void _ish_sync_fw_clock(struct ishtp_device *dev) { - static unsigned long prev_sync; - uint64_t usec; + struct ipc_time_update_msg time = {}; - if (prev_sync && time_before(jiffies, prev_sync + 20 * HZ)) + if (dev->prev_sync && time_before(jiffies, dev->prev_sync + 20 * HZ)) return; - prev_sync = jiffies; - usec = ktime_to_us(ktime_get_boottime()); - ipc_send_mng_msg(dev, MNG_SYNC_FW_CLOCK, &usec, sizeof(uint64_t)); + dev->prev_sync = jiffies; + /* The fields of time would be updated while sending message */ + ipc_send_mng_msg(dev, MNG_SYNC_FW_CLOCK, &time, sizeof(time)); } /** diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 9e2401291a2f..ff0fc8010072 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -26,9 +26,11 @@ enum ishtp_driver_data_index { ISHTP_DRIVER_DATA_NONE, ISHTP_DRIVER_DATA_LNL_M, + ISHTP_DRIVER_DATA_PTL, }; #define ISH_FW_GEN_LNL_M "lnlm" +#define ISH_FW_GEN_PTL "ptl" #define ISH_FIRMWARE_PATH(gen) "intel/ish/ish_" gen ".bin" #define ISH_FIRMWARE_PATH_ALL "intel/ish/ish_*.bin" @@ -37,6 +39,9 @@ static struct ishtp_driver_data ishtp_driver_data[] = { [ISHTP_DRIVER_DATA_LNL_M] = { .fw_generation = ISH_FW_GEN_LNL_M, }, + [ISHTP_DRIVER_DATA_PTL] = { + .fw_generation = ISH_FW_GEN_PTL, + }, }; static const struct pci_device_id ish_pci_tbl[] = { @@ -63,6 +68,8 @@ static const struct pci_device_id ish_pci_tbl[] = { {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_ARL_H)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_ARL_S)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_LNL_M), .driver_data = ISHTP_DRIVER_DATA_LNL_M}, + {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_PTL_H), .driver_data = ISHTP_DRIVER_DATA_PTL}, + {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_PTL_P), .driver_data = ISHTP_DRIVER_DATA_PTL}, {} }; MODULE_DEVICE_TABLE(pci, ish_pci_tbl); diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h index 44eddc411e97..ec9f6e87aaf2 100644 --- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h +++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h @@ -253,6 +253,8 @@ struct ishtp_device { unsigned int ipc_tx_cnt; unsigned long long ipc_tx_bytes_cnt; + /* Time of the last clock sync */ + unsigned long prev_sync; const struct ishtp_hw_ops *ops; size_t mtu; uint32_t ishtp_msg_hdr; diff --git a/drivers/hid/intel-thc-hid/Kconfig b/drivers/hid/intel-thc-hid/Kconfig index 91ec84902db8..0351d1137607 100644 --- a/drivers/hid/intel-thc-hid/Kconfig +++ b/drivers/hid/intel-thc-hid/Kconfig @@ -7,7 +7,6 @@ menu "Intel THC HID Support" config INTEL_THC_HID tristate "Intel Touch Host Controller" depends on ACPI - select HID help THC (Touch Host Controller) is the name of the IP block in PCH that interfaces with Touch Devices (ex: touchscreen, touchpad etc.). It diff --git a/drivers/hid/surface-hid/Kconfig b/drivers/hid/surface-hid/Kconfig index 7ce9b5d641eb..d0cfd0d29926 100644 --- a/drivers/hid/surface-hid/Kconfig +++ b/drivers/hid/surface-hid/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0+ menu "Surface System Aggregator Module HID support" depends on SURFACE_AGGREGATOR - depends on INPUT config SURFACE_HID tristate "HID transport driver for Surface System Aggregator Module" @@ -39,4 +38,3 @@ endmenu config SURFACE_HID_CORE tristate - select HID diff --git a/drivers/hid/usbhid/Kconfig b/drivers/hid/usbhid/Kconfig index 7c2032f7f44d..f3194767a45e 100644 --- a/drivers/hid/usbhid/Kconfig +++ b/drivers/hid/usbhid/Kconfig @@ -5,8 +5,7 @@ menu "USB HID support" config USB_HID tristate "USB HID transport layer" default y - depends on USB && INPUT - select HID + depends on HID help Say Y here if you want to connect USB keyboards, mice, joysticks, graphic tablets, or any other HID based devices diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 5546184df05f..35a221e2c11c 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -1300,12 +1300,14 @@ new_device_store(struct device *dev, struct device_attribute *attr, info.flags |= I2C_CLIENT_SLAVE; } - info.flags |= I2C_CLIENT_USER; - client = i2c_new_client_device(adap, &info); if (IS_ERR(client)) return PTR_ERR(client); + /* Keep track of the added device */ + mutex_lock(&adap->userspace_clients_lock); + list_add_tail(&client->detected, &adap->userspace_clients); + mutex_unlock(&adap->userspace_clients_lock); dev_info(dev, "%s: Instantiated device %s at 0x%02hx\n", "new_device", info.type, info.addr); @@ -1313,15 +1315,6 @@ new_device_store(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_WO(new_device); -static int __i2c_find_user_addr(struct device *dev, const void *addrp) -{ - struct i2c_client *client = i2c_verify_client(dev); - unsigned short addr = *(unsigned short *)addrp; - - return client && client->flags & I2C_CLIENT_USER && - i2c_encode_flags_to_addr(client) == addr; -} - /* * And of course let the users delete the devices they instantiated, if * they got it wrong. This interface can only be used to delete devices @@ -1336,7 +1329,7 @@ delete_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); - struct device *child_dev; + struct i2c_client *client, *next; unsigned short addr; char end; int res; @@ -1352,19 +1345,28 @@ delete_device_store(struct device *dev, struct device_attribute *attr, return -EINVAL; } - mutex_lock(&core_lock); /* Make sure the device was added through sysfs */ - child_dev = device_find_child(&adap->dev, &addr, __i2c_find_user_addr); - if (child_dev) { - i2c_unregister_device(i2c_verify_client(child_dev)); - put_device(child_dev); - } else { - dev_err(dev, "Can't find userspace-created device at %#x\n", addr); - count = -ENOENT; + res = -ENOENT; + mutex_lock_nested(&adap->userspace_clients_lock, + i2c_adapter_depth(adap)); + list_for_each_entry_safe(client, next, &adap->userspace_clients, + detected) { + if (i2c_encode_flags_to_addr(client) == addr) { + dev_info(dev, "%s: Deleting device %s at 0x%02hx\n", + "delete_device", client->name, client->addr); + + list_del(&client->detected); + i2c_unregister_device(client); + res = count; + break; + } } - mutex_unlock(&core_lock); + mutex_unlock(&adap->userspace_clients_lock); - return count; + if (res < 0) + dev_err(dev, "%s: Can't find device in list\n", + "delete_device"); + return res; } static DEVICE_ATTR_IGNORE_LOCKDEP(delete_device, S_IWUSR, NULL, delete_device_store); @@ -1535,6 +1537,8 @@ static int i2c_register_adapter(struct i2c_adapter *adap) adap->locked_flags = 0; rt_mutex_init(&adap->bus_lock); rt_mutex_init(&adap->mux_lock); + mutex_init(&adap->userspace_clients_lock); + INIT_LIST_HEAD(&adap->userspace_clients); /* Set default timeout to 1 second if not already set */ if (adap->timeout == 0) @@ -1700,6 +1704,23 @@ int i2c_add_numbered_adapter(struct i2c_adapter *adap) } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); +static void i2c_do_del_adapter(struct i2c_driver *driver, + struct i2c_adapter *adapter) +{ + struct i2c_client *client, *_n; + + /* Remove the devices we created ourselves as the result of hardware + * probing (using a driver's detect method) */ + list_for_each_entry_safe(client, _n, &driver->clients, detected) { + if (client->adapter == adapter) { + dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", + client->name, client->addr); + list_del(&client->detected); + i2c_unregister_device(client); + } + } +} + static int __unregister_client(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); @@ -1715,6 +1736,12 @@ static int __unregister_dummy(struct device *dev, void *dummy) return 0; } +static int __process_removed_adapter(struct device_driver *d, void *data) +{ + i2c_do_del_adapter(to_i2c_driver(d), data); + return 0; +} + /** * i2c_del_adapter - unregister I2C adapter * @adap: the adapter being unregistered @@ -1726,6 +1753,7 @@ static int __unregister_dummy(struct device *dev, void *dummy) void i2c_del_adapter(struct i2c_adapter *adap) { struct i2c_adapter *found; + struct i2c_client *client, *next; /* First make sure that this adapter was ever added */ mutex_lock(&core_lock); @@ -1737,16 +1765,31 @@ void i2c_del_adapter(struct i2c_adapter *adap) } i2c_acpi_remove_space_handler(adap); + /* Tell drivers about this removal */ + mutex_lock(&core_lock); + bus_for_each_drv(&i2c_bus_type, NULL, adap, + __process_removed_adapter); + mutex_unlock(&core_lock); + + /* Remove devices instantiated from sysfs */ + mutex_lock_nested(&adap->userspace_clients_lock, + i2c_adapter_depth(adap)); + list_for_each_entry_safe(client, next, &adap->userspace_clients, + detected) { + dev_dbg(&adap->dev, "Removing %s at 0x%x\n", client->name, + client->addr); + list_del(&client->detected); + i2c_unregister_device(client); + } + mutex_unlock(&adap->userspace_clients_lock); /* Detach any active clients. This can't fail, thus we do not * check the returned value. This is a two-pass process, because * we can't remove the dummy devices during the first pass: they * could have been instantiated by real devices wishing to clean * them up properly, so we give them a chance to do that first. */ - mutex_lock(&core_lock); device_for_each_child(&adap->dev, NULL, __unregister_client); device_for_each_child(&adap->dev, NULL, __unregister_dummy); - mutex_unlock(&core_lock); /* device name is gone after device_unregister */ dev_dbg(&adap->dev, "adapter [%s] unregistered\n", adap->name); @@ -1966,6 +2009,7 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) /* add the driver to the list of i2c drivers in the driver core */ driver->driver.owner = owner; driver->driver.bus = &i2c_bus_type; + INIT_LIST_HEAD(&driver->clients); /* When registration returns, the driver core * will have called probe() for all matching-but-unbound devices. @@ -1983,13 +2027,10 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) } EXPORT_SYMBOL(i2c_register_driver); -static int __i2c_unregister_detected_client(struct device *dev, void *argp) +static int __process_removed_driver(struct device *dev, void *data) { - struct i2c_client *client = i2c_verify_client(dev); - - if (client && client->flags & I2C_CLIENT_AUTO) - i2c_unregister_device(client); - + if (dev->type == &i2c_adapter_type) + i2c_do_del_adapter(data, to_i2c_adapter(dev)); return 0; } @@ -2000,12 +2041,7 @@ static int __i2c_unregister_detected_client(struct device *dev, void *argp) */ void i2c_del_driver(struct i2c_driver *driver) { - mutex_lock(&core_lock); - /* Satisfy __must_check, function can't fail */ - if (driver_for_each_device(&driver->driver, NULL, NULL, - __i2c_unregister_detected_client)) { - } - mutex_unlock(&core_lock); + i2c_for_each_dev(driver, __process_removed_driver); driver_unregister(&driver->driver); pr_debug("driver [%s] unregistered\n", driver->driver.name); @@ -2432,7 +2468,6 @@ static int i2c_detect_address(struct i2c_client *temp_client, /* Finally call the custom detection function */ memset(&info, 0, sizeof(struct i2c_board_info)); info.addr = addr; - info.flags = I2C_CLIENT_AUTO; err = driver->detect(temp_client, &info); if (err) { /* -ENODEV is returned if the detection fails. We catch it @@ -2459,7 +2494,9 @@ static int i2c_detect_address(struct i2c_client *temp_client, dev_dbg(&adapter->dev, "Creating %s at 0x%02x\n", info.type, info.addr); client = i2c_new_client_device(adapter, &info); - if (IS_ERR(client)) + if (!IS_ERR(client)) + list_add_tail(&client->detected, &driver->clients); + else dev_err(&adapter->dev, "Failed creating %s at 0x%02x\n", info.type, info.addr); } diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index be063bfb50c4..c11b9965c4ad 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -169,6 +169,7 @@ config IXP4XX_IRQ config LAN966X_OIC tristate "Microchip LAN966x OIC Support" + depends on MCHP_LAN966X_PCI || COMPILE_TEST select GENERIC_IRQ_CHIP select IRQ_DOMAIN help diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index da5250f0155c..2b1684c60e3c 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -577,7 +577,8 @@ static void __exception_irq_entry aic_handle_fiq(struct pt_regs *regs) AIC_FIQ_HWIRQ(AIC_TMR_EL02_VIRT)); } - if (read_sysreg_s(SYS_IMP_APL_PMCR0_EL1) & PMCR0_IACT) { + if ((read_sysreg_s(SYS_IMP_APL_PMCR0_EL1) & (PMCR0_IMODE | PMCR0_IACT)) == + (FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ) | PMCR0_IACT)) { int irq; if (cpumask_test_cpu(smp_processor_id(), &aic_irqc->fiq_aff[AIC_CPU_PMU_P]->aff)) diff --git a/drivers/irqchip/irq-mvebu-icu.c b/drivers/irqchip/irq-mvebu-icu.c index b337f6c05f18..4eebed39880a 100644 --- a/drivers/irqchip/irq-mvebu-icu.c +++ b/drivers/irqchip/irq-mvebu-icu.c @@ -68,7 +68,8 @@ static int mvebu_icu_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { unsigned int param_count = static_branch_unlikely(&legacy_bindings) ? 3 : 2; - struct mvebu_icu_msi_data *msi_data = d->host_data; + struct msi_domain_info *info = d->host_data; + struct mvebu_icu_msi_data *msi_data = info->chip_data; struct mvebu_icu *icu = msi_data->icu; /* Check the count of the parameters in dt */ diff --git a/drivers/irqchip/irq-partition-percpu.c b/drivers/irqchip/irq-partition-percpu.c index 8e76d2913e6b..4441ffe149ea 100644 --- a/drivers/irqchip/irq-partition-percpu.c +++ b/drivers/irqchip/irq-partition-percpu.c @@ -98,7 +98,7 @@ static void partition_irq_print_chip(struct irq_data *d, struct seq_file *p) struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - seq_printf(p, " %5s-%lu", chip->name, data->hwirq); + seq_printf(p, "%5s-%lu", chip->name, data->hwirq); } static struct irq_chip partition_irq_chip = { diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c index c5c2e6929a2f..275df5005705 100644 --- a/drivers/irqchip/irq-riscv-imsic-early.c +++ b/drivers/irqchip/irq-riscv-imsic-early.c @@ -27,7 +27,7 @@ static void imsic_ipi_send(unsigned int cpu) { struct imsic_local_config *local = per_cpu_ptr(imsic->global.local, cpu); - writel_relaxed(IMSIC_IPI_ID, local->msi_va); + writel(IMSIC_IPI_ID, local->msi_va); } static void imsic_ipi_starting_cpu(void) diff --git a/drivers/irqchip/irq-thead-c900-aclint-sswi.c b/drivers/irqchip/irq-thead-c900-aclint-sswi.c index b0e366ade427..8ff6e7a1363b 100644 --- a/drivers/irqchip/irq-thead-c900-aclint-sswi.c +++ b/drivers/irqchip/irq-thead-c900-aclint-sswi.c @@ -31,7 +31,7 @@ static DEFINE_PER_CPU(void __iomem *, sswi_cpu_regs); static void thead_aclint_sswi_ipi_send(unsigned int cpu) { - writel_relaxed(0x1, per_cpu(sswi_cpu_regs, cpu)); + writel(0x1, per_cpu(sswi_cpu_regs, cpu)); } static void thead_aclint_sswi_ipi_clear(void) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index a382929ce7ba..369aed044b40 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -76,10 +76,8 @@ static int linear_set_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c index 226915ca3c93..aa4a9940b569 100644 --- a/drivers/mfd/syscon.c +++ b/drivers/mfd/syscon.c @@ -159,6 +159,7 @@ err_regmap: } static struct regmap *device_node_get_regmap(struct device_node *np, + bool create_regmap, bool check_res) { struct syscon *entry, *syscon = NULL; @@ -172,7 +173,7 @@ static struct regmap *device_node_get_regmap(struct device_node *np, } if (!syscon) { - if (of_device_is_compatible(np, "syscon")) + if (create_regmap) syscon = of_syscon_register(np, check_res); else syscon = ERR_PTR(-EINVAL); @@ -233,15 +234,37 @@ err_unlock: } EXPORT_SYMBOL_GPL(of_syscon_register_regmap); +/** + * device_node_to_regmap() - Get or create a regmap for specified device node + * @np: Device tree node + * + * Get a regmap for the specified device node. If there's not an existing + * regmap, then one is instantiated. This function should not be used if the + * device node has a custom regmap driver or has resources (clocks, resets) to + * be managed. Use syscon_node_to_regmap() instead for those cases. + * + * Return: regmap ptr on success, negative error code on failure. + */ struct regmap *device_node_to_regmap(struct device_node *np) { - return device_node_get_regmap(np, false); + return device_node_get_regmap(np, true, false); } EXPORT_SYMBOL_GPL(device_node_to_regmap); +/** + * syscon_node_to_regmap() - Get or create a regmap for specified syscon device node + * @np: Device tree node + * + * Get a regmap for the specified device node. If there's not an existing + * regmap, then one is instantiated if the node is a generic "syscon". This + * function is safe to use for a syscon registered with + * of_syscon_register_regmap(). + * + * Return: regmap ptr on success, negative error code on failure. + */ struct regmap *syscon_node_to_regmap(struct device_node *np) { - return device_node_get_regmap(np, true); + return device_node_get_regmap(np, of_device_is_compatible(np, "syscon"), true); } EXPORT_SYMBOL_GPL(syscon_node_to_regmap); diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c index 6cba9717a6d8..399844809bbe 100644 --- a/drivers/net/can/c_can/c_can_platform.c +++ b/drivers/net/can/c_can/c_can_platform.c @@ -385,15 +385,16 @@ static int c_can_plat_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "registering %s failed (err=%d)\n", KBUILD_MODNAME, ret); - goto exit_free_device; + goto exit_pm_runtime; } dev_info(&pdev->dev, "%s device registered (regs=%p, irq=%d)\n", KBUILD_MODNAME, priv->base, dev->irq); return 0; -exit_free_device: +exit_pm_runtime: pm_runtime_disable(priv->device); +exit_free_device: free_c_can_dev(dev); exit: dev_err(&pdev->dev, "probe failed\n"); diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c index 64c349fd4600..f65c1a1e05cc 100644 --- a/drivers/net/can/ctucanfd/ctucanfd_base.c +++ b/drivers/net/can/ctucanfd/ctucanfd_base.c @@ -867,10 +867,12 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr) } break; case CAN_STATE_ERROR_ACTIVE: - cf->can_id |= CAN_ERR_CNT; - cf->data[1] = CAN_ERR_CRTL_ACTIVE; - cf->data[6] = bec.txerr; - cf->data[7] = bec.rxerr; + if (skb) { + cf->can_id |= CAN_ERR_CNT; + cf->data[1] = CAN_ERR_CRTL_ACTIVE; + cf->data[6] = bec.txerr; + cf->data[7] = bec.rxerr; + } break; default: netdev_warn(ndev, "unhandled error state (%d:%s)!\n", diff --git a/drivers/net/can/rockchip/rockchip_canfd-core.c b/drivers/net/can/rockchip/rockchip_canfd-core.c index df18c85fc078..d9a937ba126c 100644 --- a/drivers/net/can/rockchip/rockchip_canfd-core.c +++ b/drivers/net/can/rockchip/rockchip_canfd-core.c @@ -622,7 +622,7 @@ rkcanfd_handle_rx_fifo_overflow_int(struct rkcanfd_priv *priv) netdev_dbg(priv->ndev, "RX-FIFO overflow\n"); skb = rkcanfd_alloc_can_err_skb(priv, &cf, ×tamp); - if (skb) + if (!skb) return 0; rkcanfd_get_berr_counter_corrected(priv, &bec); diff --git a/drivers/net/can/usb/etas_es58x/es58x_devlink.c b/drivers/net/can/usb/etas_es58x/es58x_devlink.c index eee20839d96f..0d155eb1b9e9 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_devlink.c +++ b/drivers/net/can/usb/etas_es58x/es58x_devlink.c @@ -248,7 +248,11 @@ static int es58x_devlink_info_get(struct devlink *devlink, return ret; } - return devlink_info_serial_number_put(req, es58x_dev->udev->serial); + if (es58x_dev->udev->serial) + ret = devlink_info_serial_number_put(req, + es58x_dev->udev->serial); + + return ret; } const struct devlink_ops es58x_dl_ops = { diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index fe0e3e2a8117..71e50fc65c14 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -1441,7 +1441,9 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down) aq_ptp_ring_free(self); aq_ptp_free(self); - if (likely(self->aq_fw_ops->deinit) && link_down) { + /* May be invoked during hot unplug. */ + if (pci_device_is_present(self->pdev) && + likely(self->aq_fw_ops->deinit) && link_down) { mutex_lock(&self->fwreq_mutex); self->aq_fw_ops->deinit(self->aq_hw); mutex_unlock(&self->fwreq_mutex); diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index 0715ea5bf13e..3b082114f2e5 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -41,9 +41,12 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) { struct bcmgenet_priv *priv = netdev_priv(dev); struct device *kdev = &priv->pdev->dev; + u32 phy_wolopts = 0; - if (dev->phydev) + if (dev->phydev) { phy_ethtool_get_wol(dev->phydev, wol); + phy_wolopts = wol->wolopts; + } /* MAC is not wake-up capable, return what the PHY does */ if (!device_can_wakeup(kdev)) @@ -51,9 +54,14 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) /* Overlay MAC capabilities with that of the PHY queried before */ wol->supported |= WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER; - wol->wolopts = priv->wolopts; - memset(wol->sopass, 0, sizeof(wol->sopass)); + wol->wolopts |= priv->wolopts; + /* Return the PHY configured magic password */ + if (phy_wolopts & WAKE_MAGICSECURE) + return; + + /* Otherwise the MAC one */ + memset(wol->sopass, 0, sizeof(wol->sopass)); if (wol->wolopts & WAKE_MAGICSECURE) memcpy(wol->sopass, priv->sopass, sizeof(priv->sopass)); } @@ -70,7 +78,7 @@ int bcmgenet_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) /* Try Wake-on-LAN from the PHY first */ if (dev->phydev) { ret = phy_ethtool_set_wol(dev->phydev, wol); - if (ret != -EOPNOTSUPP) + if (ret != -EOPNOTSUPP && wol->wolopts) return ret; } diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1c94bf1db718..d9d675f1ebfe 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -55,6 +55,7 @@ #include <linux/hwmon.h> #include <linux/hwmon-sysfs.h> #include <linux/crc32poly.h> +#include <linux/dmi.h> #include <net/checksum.h> #include <net/gso.h> @@ -18212,6 +18213,50 @@ unlock: static SIMPLE_DEV_PM_OPS(tg3_pm_ops, tg3_suspend, tg3_resume); +/* Systems where ACPI _PTS (Prepare To Sleep) S5 will result in a fatal + * PCIe AER event on the tg3 device if the tg3 device is not, or cannot + * be, powered down. + */ +static const struct dmi_system_id tg3_restart_aer_quirk_table[] = { + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R440"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R540"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R640"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R650"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R740"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R750"), + }, + }, + {} +}; + static void tg3_shutdown(struct pci_dev *pdev) { struct net_device *dev = pci_get_drvdata(pdev); @@ -18228,6 +18273,19 @@ static void tg3_shutdown(struct pci_dev *pdev) if (system_state == SYSTEM_POWER_OFF) tg3_power_down(tp); + else if (system_state == SYSTEM_RESTART && + dmi_first_match(tg3_restart_aer_quirk_table) && + pdev->current_state != PCI_D3cold && + pdev->current_state != PCI_UNKNOWN) { + /* Disable PCIe AER on the tg3 to avoid a fatal + * error during this system restart. + */ + pcie_capability_clear_word(pdev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_CERE | + PCI_EXP_DEVCTL_NFERE | + PCI_EXP_DEVCTL_FERE | + PCI_EXP_DEVCTL_URRE); + } rtnl_unlock(); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 2d7a18fcc3be..852e5b62f0a5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2903,8 +2903,8 @@ static void iavf_watchdog_task(struct work_struct *work) } mutex_unlock(&adapter->crit_lock); - netdev_unlock(netdev); restart_watchdog: + netdev_unlock(netdev); if (adapter->state >= __IAVF_DOWN) queue_work(adapter->wq, &adapter->adminq_task); if (adapter->aq_required) diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index d116e2b10bce..dbdb83567364 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -981,6 +981,9 @@ static int ice_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv /* preallocate memory for ice_sched_node */ node = devm_kzalloc(ice_hw_to_dev(pi->hw), sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + *priv = node; return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 5d2d7736fd5f..9c9ea4c1b93b 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -527,15 +527,14 @@ err: * @xdp: xdp_buff used as input to the XDP program * @xdp_prog: XDP program to run * @xdp_ring: ring to be used for XDP_TX action - * @rx_buf: Rx buffer to store the XDP action * @eop_desc: Last descriptor in packet to read metadata from * * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} */ -static void +static u32 ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring, - struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *eop_desc) + union ice_32b_rx_flex_desc *eop_desc) { unsigned int ret = ICE_XDP_PASS; u32 act; @@ -574,7 +573,7 @@ out_failure: ret = ICE_XDP_CONSUMED; } exit: - ice_set_rx_bufs_act(xdp, rx_ring, ret); + return ret; } /** @@ -860,10 +859,8 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, xdp_buff_set_frags_flag(xdp); } - if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) return -ENOMEM; - } __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, rx_buf->page_offset, size); @@ -924,7 +921,6 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, struct ice_rx_buf *rx_buf; rx_buf = &rx_ring->rx_buf[ntc]; - rx_buf->pgcnt = page_count(rx_buf->page); prefetchw(rx_buf->page); if (!size) @@ -941,6 +937,31 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, } /** + * ice_get_pgcnts - grab page_count() for gathered fragments + * @rx_ring: Rx descriptor ring to store the page counts on + * + * This function is intended to be called right before running XDP + * program so that the page recycling mechanism will be able to take + * a correct decision regarding underlying pages; this is done in such + * way as XDP program can change the refcount of page + */ +static void ice_get_pgcnts(struct ice_rx_ring *rx_ring) +{ + u32 nr_frags = rx_ring->nr_frags + 1; + u32 idx = rx_ring->first_desc; + struct ice_rx_buf *rx_buf; + u32 cnt = rx_ring->count; + + for (int i = 0; i < nr_frags; i++) { + rx_buf = &rx_ring->rx_buf[idx]; + rx_buf->pgcnt = page_count(rx_buf->page); + + if (++idx == cnt) + idx = 0; + } +} + +/** * ice_build_skb - Build skb around an existing buffer * @rx_ring: Rx descriptor ring to transact packets on * @xdp: xdp_buff pointing to the data @@ -1051,12 +1072,12 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) rx_buf->page_offset + headlen, size, xdp->frame_sz); } else { - /* buffer is unused, change the act that should be taken later - * on; data was copied onto skb's linear part so there's no + /* buffer is unused, restore biased page count in Rx buffer; + * data was copied onto skb's linear part so there's no * need for adjusting page offset and we can reuse this buffer * as-is */ - rx_buf->act = ICE_SKB_CONSUMED; + rx_buf->pagecnt_bias++; } if (unlikely(xdp_buff_has_frags(xdp))) { @@ -1104,6 +1125,65 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) } /** + * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags + * @rx_ring: Rx ring with all the auxiliary data + * @xdp: XDP buffer carrying linear + frags part + * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage + * @ntc: a current next_to_clean value to be stored at rx_ring + * @verdict: return code from XDP program execution + * + * Walk through gathered fragments and satisfy internal page + * recycle mechanism; we take here an action related to verdict + * returned by XDP program; + */ +static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, + u32 *xdp_xmit, u32 ntc, u32 verdict) +{ + u32 nr_frags = rx_ring->nr_frags + 1; + u32 idx = rx_ring->first_desc; + u32 cnt = rx_ring->count; + u32 post_xdp_frags = 1; + struct ice_rx_buf *buf; + int i; + + if (unlikely(xdp_buff_has_frags(xdp))) + post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags; + + for (i = 0; i < post_xdp_frags; i++) { + buf = &rx_ring->rx_buf[idx]; + + if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) { + ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); + *xdp_xmit |= verdict; + } else if (verdict & ICE_XDP_CONSUMED) { + buf->pagecnt_bias++; + } else if (verdict == ICE_XDP_PASS) { + ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); + } + + ice_put_rx_buf(rx_ring, buf); + + if (++idx == cnt) + idx = 0; + } + /* handle buffers that represented frags released by XDP prog; + * for these we keep pagecnt_bias as-is; refcount from struct page + * has been decremented within XDP prog and we do not have to increase + * the biased refcnt + */ + for (; i < nr_frags; i++) { + buf = &rx_ring->rx_buf[idx]; + ice_put_rx_buf(rx_ring, buf); + if (++idx == cnt) + idx = 0; + } + + xdp->data = NULL; + rx_ring->first_desc = ntc; + rx_ring->nr_frags = 0; +} + +/** * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf * @rx_ring: Rx descriptor ring to transact packets on * @budget: Total limit on number of packets to process @@ -1120,15 +1200,13 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_pkts = 0; unsigned int offset = rx_ring->rx_offset; struct xdp_buff *xdp = &rx_ring->xdp; - u32 cached_ntc = rx_ring->first_desc; struct ice_tx_ring *xdp_ring = NULL; struct bpf_prog *xdp_prog = NULL; u32 ntc = rx_ring->next_to_clean; + u32 cached_ntu, xdp_verdict; u32 cnt = rx_ring->count; u32 xdp_xmit = 0; - u32 cached_ntu; bool failure; - u32 first; xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (xdp_prog) { @@ -1190,6 +1268,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { + ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED); break; } if (++ntc == cnt) @@ -1199,15 +1278,15 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) if (ice_is_non_eop(rx_ring, rx_desc)) continue; - ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc); - if (rx_buf->act == ICE_XDP_PASS) + ice_get_pgcnts(rx_ring); + xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc); + if (xdp_verdict == ICE_XDP_PASS) goto construct_skb; total_rx_bytes += xdp_get_buff_len(xdp); total_rx_pkts++; - xdp->data = NULL; - rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; + ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + continue; construct_skb: if (likely(ice_ring_uses_build_skb(rx_ring))) @@ -1217,18 +1296,12 @@ construct_skb: /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->ring_stats->rx_stats.alloc_page_failed++; - rx_buf->act = ICE_XDP_CONSUMED; - if (unlikely(xdp_buff_has_frags(xdp))) - ice_set_rx_bufs_act(xdp, rx_ring, - ICE_XDP_CONSUMED); - xdp->data = NULL; - rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; - break; + xdp_verdict = ICE_XDP_CONSUMED; } - xdp->data = NULL; - rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; + ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + + if (!skb) + break; stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); if (unlikely(ice_test_staterr(rx_desc->wb.status_error0, @@ -1257,23 +1330,6 @@ construct_skb: total_rx_pkts++; } - first = rx_ring->first_desc; - while (cached_ntc != first) { - struct ice_rx_buf *buf = &rx_ring->rx_buf[cached_ntc]; - - if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - xdp_xmit |= buf->act; - } else if (buf->act & ICE_XDP_CONSUMED) { - buf->pagecnt_bias++; - } else if (buf->act == ICE_XDP_PASS) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - } - - ice_put_rx_buf(rx_ring, buf); - if (++cached_ntc >= cnt) - cached_ntc = 0; - } rx_ring->next_to_clean = ntc; /* return up to cleaned_count buffers to hardware */ failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring)); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index cb347c852ba9..806bce701df3 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -201,7 +201,6 @@ struct ice_rx_buf { struct page *page; unsigned int page_offset; unsigned int pgcnt; - unsigned int act; unsigned int pagecnt_bias; }; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h index 79f960c6680d..6cf32b404127 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -6,49 +6,6 @@ #include "ice.h" /** - * ice_set_rx_bufs_act - propagate Rx buffer action to frags - * @xdp: XDP buffer representing frame (linear and frags part) - * @rx_ring: Rx ring struct - * act: action to store onto Rx buffers related to XDP buffer parts - * - * Set action that should be taken before putting Rx buffer from first frag - * to the last. - */ -static inline void -ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring, - const unsigned int act) -{ - u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; - u32 nr_frags = rx_ring->nr_frags + 1; - u32 idx = rx_ring->first_desc; - u32 cnt = rx_ring->count; - struct ice_rx_buf *buf; - - for (int i = 0; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[idx]; - buf->act = act; - - if (++idx == cnt) - idx = 0; - } - - /* adjust pagecnt_bias on frags freed by XDP prog */ - if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) { - u32 delta = rx_ring->nr_frags - sinfo_frags; - - while (delta) { - if (idx == 0) - idx = cnt - 1; - else - idx--; - buf = &rx_ring->rx_buf[idx]; - buf->pagecnt_bias--; - delta--; - } - } -} - -/** * ice_test_staterr - tests bits in Rx descriptor status and error fields * @status_err_n: Rx descriptor status_error0 or status_error1 bits * @stat_err_bits: value to mask diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index b4fbb99bfad2..a3d6b8f198a8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -2159,8 +2159,13 @@ static int idpf_open(struct net_device *netdev) idpf_vport_ctrl_lock(netdev); vport = idpf_netdev_to_vport(netdev); + err = idpf_set_real_num_queues(vport); + if (err) + goto unlock; + err = idpf_vport_open(vport); +unlock: idpf_vport_ctrl_unlock(netdev); return err; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 2fa9c36e33c9..9be6a6b59c4e 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3008,8 +3008,6 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, return -EINVAL; rsc_segments = DIV_ROUND_UP(skb->data_len, rsc_seg_len); - if (unlikely(rsc_segments == 1)) - return 0; NAPI_GRO_CB(skb)->count = rsc_segments; skb_shinfo(skb)->gso_size = rsc_seg_len; @@ -3072,6 +3070,7 @@ idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb, idpf_rx_hash(rxq, skb, rx_desc, decoded); skb->protocol = eth_type_trans(skb, rxq->netdev); + skb_record_rx_queue(skb, rxq->idx); if (le16_get_bits(rx_desc->hdrlen_flags, VIRTCHNL2_RX_FLEX_DESC_ADV_RSC_M)) @@ -3080,8 +3079,6 @@ idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb, csum_bits = idpf_rx_splitq_extract_csum_bits(rx_desc); idpf_rx_csum(rxq, skb, csum_bits, decoded); - skb_record_rx_queue(skb, rxq->idx); - return 0; } diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 56a35d58e7a6..84307bb7313e 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1096,6 +1096,7 @@ static int igc_init_empty_frame(struct igc_ring *ring, return -ENOMEM; } + buffer->type = IGC_TX_BUFFER_TYPE_SKB; buffer->skb = skb; buffer->protocol = 0; buffer->bytecount = skb->len; @@ -2701,8 +2702,9 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) } static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, - struct xdp_buff *xdp) + struct igc_xdp_buff *ctx) { + struct xdp_buff *xdp = &ctx->xdp; unsigned int totalsize = xdp->data_end - xdp->data_meta; unsigned int metasize = xdp->data - xdp->data_meta; struct sk_buff *skb; @@ -2721,27 +2723,28 @@ static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, __skb_pull(skb, metasize); } + if (ctx->rx_ts) { + skb_shinfo(skb)->tx_flags |= SKBTX_HW_TSTAMP_NETDEV; + skb_hwtstamps(skb)->netdev_data = ctx->rx_ts; + } + return skb; } static void igc_dispatch_skb_zc(struct igc_q_vector *q_vector, union igc_adv_rx_desc *desc, - struct xdp_buff *xdp, - ktime_t timestamp) + struct igc_xdp_buff *ctx) { struct igc_ring *ring = q_vector->rx.ring; struct sk_buff *skb; - skb = igc_construct_skb_zc(ring, xdp); + skb = igc_construct_skb_zc(ring, ctx); if (!skb) { ring->rx_stats.alloc_failed++; set_bit(IGC_RING_FLAG_RX_ALLOC_FAILED, &ring->flags); return; } - if (timestamp) - skb_hwtstamps(skb)->hwtstamp = timestamp; - if (igc_cleanup_headers(ring, desc, skb)) return; @@ -2777,7 +2780,6 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget) union igc_adv_rx_desc *desc; struct igc_rx_buffer *bi; struct igc_xdp_buff *ctx; - ktime_t timestamp = 0; unsigned int size; int res; @@ -2807,6 +2809,8 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget) */ bi->xdp->data_meta += IGC_TS_HDR_LEN; size -= IGC_TS_HDR_LEN; + } else { + ctx->rx_ts = NULL; } bi->xdp->data_end = bi->xdp->data + size; @@ -2815,7 +2819,7 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget) res = __igc_xdp_run_prog(adapter, prog, bi->xdp); switch (res) { case IGC_XDP_PASS: - igc_dispatch_skb_zc(q_vector, desc, bi->xdp, timestamp); + igc_dispatch_skb_zc(q_vector, desc, ctx); fallthrough; case IGC_XDP_CONSUMED: xsk_buff_free(bi->xdp); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7236f20c9a30..467f81239e12 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2105,7 +2105,7 @@ static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring, /* hand second half of page back to the ring */ ixgbe_reuse_rx_page(rx_ring, rx_buffer); } else { - if (!IS_ERR(skb) && IXGBE_CB(skb)->dma == rx_buffer->dma) { + if (skb && IXGBE_CB(skb)->dma == rx_buffer->dma) { /* the page has been released from the ring */ IXGBE_CB(skb)->page_released = true; } else { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c index 2bed8c86b7cf..3f64cdbabfa3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c @@ -768,7 +768,9 @@ static void __mlxsw_sp_port_get_stats(struct net_device *dev, err = mlxsw_sp_get_hw_stats_by_group(&hw_stats, &len, grp); if (err) return; - mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl); + err = mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl); + if (err) + return; for (i = 0; i < len; i++) { data[data_index + i] = hw_stats[i].getter(ppcnt_pl); if (!hw_stats[i].cells_bytes) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d04543e5697b..c0ae7db96f46 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2094,6 +2094,11 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv, pp_params.offset = stmmac_rx_offset(priv); pp_params.max_len = dma_conf->dma_buf_sz; + if (priv->sph) { + pp_params.offset = 0; + pp_params.max_len += stmmac_rx_offset(priv); + } + rx_q->page_pool = page_pool_create(&pp_params); if (IS_ERR(rx_q->page_pool)) { ret = PTR_ERR(rx_q->page_pool); @@ -2424,6 +2429,11 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) u32 chan = 0; u8 qmode = 0; + if (rxfifosz == 0) + rxfifosz = priv->dma_cap.rx_fifo_size; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + /* Split up the shared Tx/Rx FIFO memory on DW QoS Eth and DW XGMAC */ if (priv->plat->has_gmac4 || priv->plat->has_xgmac) { rxfifosz /= rx_channels_count; @@ -2892,6 +2902,11 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; + if (rxfifosz == 0) + rxfifosz = priv->dma_cap.rx_fifo_size; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + /* Adjust for real per queue fifo size */ rxfifosz /= rx_channels_count; txfifosz /= tx_channels_count; @@ -5868,6 +5883,9 @@ static int stmmac_change_mtu(struct net_device *dev, int new_mtu) const int mtu = new_mtu; int ret; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + txfifosz /= priv->plat->tx_queues_to_use; if (stmmac_xdp_is_enabled(priv) && new_mtu > ETH_DATA_LEN) { @@ -7219,29 +7237,15 @@ static int stmmac_hw_init(struct stmmac_priv *priv) priv->plat->tx_queues_to_use = priv->dma_cap.number_tx_queues; } - if (!priv->plat->rx_fifo_size) { - if (priv->dma_cap.rx_fifo_size) { - priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; - } else { - dev_err(priv->device, "Can't specify Rx FIFO size\n"); - return -ENODEV; - } - } else if (priv->dma_cap.rx_fifo_size && - priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { + if (priv->dma_cap.rx_fifo_size && + priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { dev_warn(priv->device, "Rx FIFO size (%u) exceeds dma capability\n", priv->plat->rx_fifo_size); priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; } - if (!priv->plat->tx_fifo_size) { - if (priv->dma_cap.tx_fifo_size) { - priv->plat->tx_fifo_size = priv->dma_cap.tx_fifo_size; - } else { - dev_err(priv->device, "Can't specify Tx FIFO size\n"); - return -ENODEV; - } - } else if (priv->dma_cap.tx_fifo_size && - priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { + if (priv->dma_cap.tx_fifo_size && + priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { dev_warn(priv->device, "Tx FIFO size (%u) exceeds dma capability\n", priv->plat->tx_fifo_size); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index b663271e79f7..2806238629f8 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -828,21 +828,30 @@ static void am65_cpsw_nuss_xmit_free(struct am65_cpsw_tx_chn *tx_chn, static void am65_cpsw_nuss_tx_cleanup(void *data, dma_addr_t desc_dma) { struct am65_cpsw_tx_chn *tx_chn = data; + enum am65_cpsw_tx_buf_type buf_type; struct cppi5_host_desc_t *desc_tx; + struct xdp_frame *xdpf; struct sk_buff *skb; void **swdata; desc_tx = k3_cppi_desc_pool_dma2virt(tx_chn->desc_pool, desc_dma); swdata = cppi5_hdesc_get_swdata(desc_tx); - skb = *(swdata); - am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); + buf_type = am65_cpsw_nuss_buf_type(tx_chn, desc_dma); + if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { + skb = *(swdata); + dev_kfree_skb_any(skb); + } else { + xdpf = *(swdata); + xdp_return_frame(xdpf); + } - dev_kfree_skb_any(skb); + am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); } static struct sk_buff *am65_cpsw_build_skb(void *page_addr, struct net_device *ndev, - unsigned int len) + unsigned int len, + unsigned int headroom) { struct sk_buff *skb; @@ -852,7 +861,7 @@ static struct sk_buff *am65_cpsw_build_skb(void *page_addr, if (unlikely(!skb)) return NULL; - skb_reserve(skb, AM65_CPSW_HEADROOM); + skb_reserve(skb, headroom); skb->dev = ndev; return skb; @@ -1169,9 +1178,11 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, struct xdp_frame *xdpf; struct bpf_prog *prog; struct page *page; + int pkt_len; u32 act; int err; + pkt_len = *len; prog = READ_ONCE(port->xdp_prog); if (!prog) return AM65_CPSW_XDP_PASS; @@ -1189,8 +1200,10 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, netif_txq = netdev_get_tx_queue(ndev, tx_chn->id); xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) + if (unlikely(!xdpf)) { + ndev->stats.tx_dropped++; goto drop; + } __netif_tx_lock(netif_txq, cpu); err = am65_cpsw_xdp_tx_frame(ndev, tx_chn, xdpf, @@ -1199,14 +1212,14 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, if (err) goto drop; - dev_sw_netstats_tx_add(ndev, 1, *len); + dev_sw_netstats_rx_add(ndev, pkt_len); ret = AM65_CPSW_XDP_CONSUMED; goto out; case XDP_REDIRECT: if (unlikely(xdp_do_redirect(ndev, xdp, prog))) goto drop; - dev_sw_netstats_rx_add(ndev, *len); + dev_sw_netstats_rx_add(ndev, pkt_len); ret = AM65_CPSW_XDP_REDIRECT; goto out; default: @@ -1315,16 +1328,8 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, dev_dbg(dev, "%s rx csum_info:%#x\n", __func__, csum_info); dma_unmap_single(rx_chn->dma_dev, buf_dma, buf_dma_len, DMA_FROM_DEVICE); - k3_cppi_desc_pool_free(rx_chn->desc_pool, desc_rx); - skb = am65_cpsw_build_skb(page_addr, ndev, - AM65_CPSW_MAX_PACKET_SIZE); - if (unlikely(!skb)) { - new_page = page; - goto requeue; - } - if (port->xdp_prog) { xdp_init_buff(&xdp, PAGE_SIZE, &port->xdp_rxq[flow->id]); xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, @@ -1334,9 +1339,16 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, if (*xdp_state != AM65_CPSW_XDP_PASS) goto allocate; - /* Compute additional headroom to be reserved */ - headroom = (xdp.data - xdp.data_hard_start) - skb_headroom(skb); - skb_reserve(skb, headroom); + headroom = xdp.data - xdp.data_hard_start; + } else { + headroom = AM65_CPSW_HEADROOM; + } + + skb = am65_cpsw_build_skb(page_addr, ndev, + AM65_CPSW_MAX_PACKET_SIZE, headroom); + if (unlikely(!skb)) { + new_page = page; + goto requeue; } ndev_priv = netdev_priv(ndev); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 214b62fba991..b00a315de060 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -2265,12 +2265,15 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, /* Allow the MAC to stop its clock if the PHY has the capability */ pl->mac_tx_clk_stop = phy_eee_tx_clock_stop_capable(phy) > 0; - /* Explicitly configure whether the PHY is allowed to stop it's - * receive clock. - */ - ret = phy_eee_rx_clock_stop(phy, pl->config->eee_rx_clk_stop_enable); - if (ret == -EOPNOTSUPP) - ret = 0; + if (pl->mac_supports_eee_ops) { + /* Explicitly configure whether the PHY is allowed to stop it's + * receive clock. + */ + ret = phy_eee_rx_clock_stop(phy, + pl->config->eee_rx_clk_stop_enable); + if (ret == -EOPNOTSUPP) + ret = 0; + } return ret; } diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index 4f2a54afc4d0..4602e26eb8c8 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -319,7 +319,7 @@ static int pse_pi_get_current_limit(struct regulator_dev *rdev) goto out; mW = ret; - ret = pse_pi_get_voltage(rdev); + ret = _pse_pi_get_voltage(rdev); if (!ret) { dev_err(pcdev->dev, "Voltage null\n"); ret = -ERANGE; @@ -356,7 +356,7 @@ static int pse_pi_set_current_limit(struct regulator_dev *rdev, int min_uA, id = rdev_get_id(rdev); mutex_lock(&pcdev->lock); - ret = pse_pi_get_voltage(rdev); + ret = _pse_pi_get_voltage(rdev); if (!ret) { dev_err(pcdev->dev, "Voltage null\n"); ret = -ERANGE; diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index dc7cbd6a9798..f4019815f473 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2639,7 +2639,9 @@ int team_nl_options_set_doit(struct sk_buff *skb, struct genl_info *info) ctx.data.u32_val = nla_get_u32(attr_data); break; case TEAM_OPTION_TYPE_STRING: - if (nla_len(attr_data) > TEAM_STRING_MAX_LEN) { + if (nla_len(attr_data) > TEAM_STRING_MAX_LEN || + !memchr(nla_data(attr_data), '\0', + nla_len(attr_data))) { err = -EINVAL; goto team_put; } diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 28624cca91f8..acf96f262488 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -574,18 +574,14 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, return ret; } -static inline bool tun_capable(struct tun_struct *tun) +static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); - if (ns_capable(net->user_ns, CAP_NET_ADMIN)) - return 1; - if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner)) - return 1; - if (gid_valid(tun->group) && in_egroup_p(tun->group)) - return 1; - return 0; + return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || + (gid_valid(tun->group) && !in_egroup_p(tun->group))) && + !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) @@ -2782,7 +2778,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; - if (!tun_capable(tun)) + if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c index 1341374a4588..616ecc38d172 100644 --- a/drivers/net/vmxnet3/vmxnet3_xdp.c +++ b/drivers/net/vmxnet3/vmxnet3_xdp.c @@ -28,7 +28,7 @@ vmxnet3_xdp_get_tq(struct vmxnet3_adapter *adapter) if (likely(cpu < tq_number)) tq = &adapter->tx_queue[cpu]; else - tq = &adapter->tx_queue[reciprocal_scale(cpu, tq_number)]; + tq = &adapter->tx_queue[cpu % tq_number]; return tq; } @@ -124,6 +124,7 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, u32 buf_size; u32 dw2; + spin_lock_irq(&tq->tx_lock); dw2 = (tq->tx_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT; dw2 |= xdpf->len; ctx.sop_txd = tq->tx_ring.base + tq->tx_ring.next2fill; @@ -134,6 +135,7 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, if (vmxnet3_cmd_ring_desc_avail(&tq->tx_ring) == 0) { tq->stats.tx_ring_full++; + spin_unlock_irq(&tq->tx_lock); return -ENOSPC; } @@ -142,8 +144,10 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, tbi->dma_addr = dma_map_single(&adapter->pdev->dev, xdpf->data, buf_size, DMA_TO_DEVICE); - if (dma_mapping_error(&adapter->pdev->dev, tbi->dma_addr)) + if (dma_mapping_error(&adapter->pdev->dev, tbi->dma_addr)) { + spin_unlock_irq(&tq->tx_lock); return -EFAULT; + } tbi->map_type |= VMXNET3_MAP_SINGLE; } else { /* XDP buffer from page pool */ page = virt_to_page(xdpf->data); @@ -182,6 +186,7 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, dma_wmb(); gdesc->dword[2] = cpu_to_le32(le32_to_cpu(gdesc->dword[2]) ^ VMXNET3_TXD_GEN); + spin_unlock_irq(&tq->tx_lock); /* No need to handle the case when tx_num_deferred doesn't reach * threshold. Backend driver at hypervisor side will poll and reset @@ -225,6 +230,7 @@ vmxnet3_xdp_xmit(struct net_device *dev, { struct vmxnet3_adapter *adapter = netdev_priv(dev); struct vmxnet3_tx_queue *tq; + struct netdev_queue *nq; int i; if (unlikely(test_bit(VMXNET3_STATE_BIT_QUIESCED, &adapter->state))) @@ -236,6 +242,9 @@ vmxnet3_xdp_xmit(struct net_device *dev, if (tq->stopped) return -ENETDOWN; + nq = netdev_get_tx_queue(adapter->netdev, tq->qid); + + __netif_tx_lock(nq, smp_processor_id()); for (i = 0; i < n; i++) { if (vmxnet3_xdp_xmit_frame(adapter, frames[i], tq, true)) { tq->stats.xdp_xmit_err++; @@ -243,6 +252,7 @@ vmxnet3_xdp_xmit(struct net_device *dev, } } tq->stats.xdp_xmit += i; + __netif_tx_unlock(nq); return i; } diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 05c10acb2a57..92516189e792 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2898,8 +2898,11 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; - if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) - vxlan_vnigroup_init(vxlan); + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { + err = vxlan_vnigroup_init(vxlan); + if (err) + return err; + } err = gro_cells_init(&vxlan->gro_cells, dev); if (err) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 4dd6cdf84571..abb510d235a5 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -4851,6 +4851,22 @@ static struct ath12k_reg_rule return reg_rule_ptr; } +static u8 ath12k_wmi_ignore_num_extra_rules(struct ath12k_wmi_reg_rule_ext_params *rule, + u32 num_reg_rules) +{ + u8 num_invalid_5ghz_rules = 0; + u32 count, start_freq; + + for (count = 0; count < num_reg_rules; count++) { + start_freq = le32_get_bits(rule[count].freq_info, REG_RULE_START_FREQ); + + if (start_freq >= ATH12K_MIN_6G_FREQ) + num_invalid_5ghz_rules++; + } + + return num_invalid_5ghz_rules; +} + static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, struct sk_buff *skb, struct ath12k_reg_info *reg_info) @@ -4861,6 +4877,7 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, u32 num_2g_reg_rules, num_5g_reg_rules; u32 num_6g_reg_rules_ap[WMI_REG_CURRENT_MAX_AP_TYPE]; u32 num_6g_reg_rules_cl[WMI_REG_CURRENT_MAX_AP_TYPE][WMI_REG_MAX_CLIENT_TYPE]; + u8 num_invalid_5ghz_ext_rules; u32 total_reg_rules = 0; int ret, i, j; @@ -4954,20 +4971,6 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, memcpy(reg_info->alpha2, &ev->alpha2, REG_ALPHA2_LEN); - /* FIXME: Currently FW includes 6G reg rule also in 5G rule - * list for country US. - * Having same 6G reg rule in 5G and 6G rules list causes - * intersect check to be true, and same rules will be shown - * multiple times in iw cmd. So added hack below to avoid - * parsing 6G rule from 5G reg rule list, and this can be - * removed later, after FW updates to remove 6G reg rule - * from 5G rules list. - */ - if (memcmp(reg_info->alpha2, "US", 2) == 0) { - reg_info->num_5g_reg_rules = REG_US_5G_NUM_REG_RULES; - num_5g_reg_rules = reg_info->num_5g_reg_rules; - } - reg_info->dfs_region = le32_to_cpu(ev->dfs_region); reg_info->phybitmap = le32_to_cpu(ev->phybitmap); reg_info->num_phy = le32_to_cpu(ev->num_phy); @@ -5070,8 +5073,29 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, } } + ext_wmi_reg_rule += num_2g_reg_rules; + + /* Firmware might include 6 GHz reg rule in 5 GHz rule list + * for few countries along with separate 6 GHz rule. + * Having same 6 GHz reg rule in 5 GHz and 6 GHz rules list + * causes intersect check to be true, and same rules will be + * shown multiple times in iw cmd. + * Hence, avoid parsing 6 GHz rule from 5 GHz reg rule list + */ + num_invalid_5ghz_ext_rules = ath12k_wmi_ignore_num_extra_rules(ext_wmi_reg_rule, + num_5g_reg_rules); + + if (num_invalid_5ghz_ext_rules) { + ath12k_dbg(ab, ATH12K_DBG_WMI, + "CC: %s 5 GHz reg rules number %d from fw, %d number of invalid 5 GHz rules", + reg_info->alpha2, reg_info->num_5g_reg_rules, + num_invalid_5ghz_ext_rules); + + num_5g_reg_rules = num_5g_reg_rules - num_invalid_5ghz_ext_rules; + reg_info->num_5g_reg_rules = num_5g_reg_rules; + } + if (num_5g_reg_rules) { - ext_wmi_reg_rule += num_2g_reg_rules; reg_info->reg_rules_5g_ptr = create_ext_reg_rules_from_wmi(num_5g_reg_rules, ext_wmi_reg_rule); @@ -5083,7 +5107,12 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, } } - ext_wmi_reg_rule += num_5g_reg_rules; + /* We have adjusted the number of 5 GHz reg rules above. But still those + * many rules needs to be adjusted in ext_wmi_reg_rule. + * + * NOTE: num_invalid_5ghz_ext_rules will be 0 for rest other cases. + */ + ext_wmi_reg_rule += (num_5g_reg_rules + num_invalid_5ghz_ext_rules); for (i = 0; i < WMI_REG_CURRENT_MAX_AP_TYPE; i++) { reg_info->reg_rules_6g_ap_ptr[i] = diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index b6a197389277..45fe699ce8a5 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -4073,7 +4073,6 @@ struct ath12k_wmi_eht_rate_set_params { #define MAX_REG_RULES 10 #define REG_ALPHA2_LEN 2 #define MAX_6G_REG_RULES 5 -#define REG_US_5G_NUM_REG_RULES 4 enum wmi_start_event_param { WMI_VDEV_START_RESP_EVENT = 0, diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c index e4395b1f8c11..d2caa80e9412 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c @@ -2712,7 +2712,7 @@ static const struct pci_device_id brcmf_pcie_devid_table[] = { BRCMF_PCIE_DEVICE(BRCM_PCIE_4350_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE_SUB(0x4355, BRCM_PCIE_VENDOR_ID_BROADCOM, 0x4355, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_4354_RAW_DEVICE_ID, WCC), - BRCMF_PCIE_DEVICE(BRCM_PCIE_4355_DEVICE_ID, WCC), + BRCMF_PCIE_DEVICE(BRCM_PCIE_4355_DEVICE_ID, WCC_SEED), BRCMF_PCIE_DEVICE(BRCM_PCIE_4356_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43567_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43570_DEVICE_ID, WCC), @@ -2723,7 +2723,7 @@ static const struct pci_device_id brcmf_pcie_devid_table[] = { BRCMF_PCIE_DEVICE(BRCM_PCIE_43602_2G_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43602_5G_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43602_RAW_DEVICE_ID, WCC), - BRCMF_PCIE_DEVICE(BRCM_PCIE_4364_DEVICE_ID, WCC), + BRCMF_PCIE_DEVICE(BRCM_PCIE_4364_DEVICE_ID, WCC_SEED), BRCMF_PCIE_DEVICE(BRCM_PCIE_4365_DEVICE_ID, BCA), BRCMF_PCIE_DEVICE(BRCM_PCIE_4365_2G_DEVICE_ID, BCA), BRCMF_PCIE_DEVICE(BRCM_PCIE_4365_5G_DEVICE_ID, BCA), diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40046770f1bf..818d4e49aab5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1700,7 +1700,13 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, &result); - if (status < 0) + + /* + * It's either a kernel error or the host observed a connection + * lost. In either case it's not possible communicate with the + * controller and thus enter the error code path. + */ + if (status < 0 || status == NVME_SC_HOST_PATH_ERROR) return status; /* diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 094be164ffdc..f4f1866fbd5b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -781,11 +781,19 @@ restart: static void nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) { + enum nvme_ctrl_state state; + unsigned long flags; + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller connectivity lost. Awaiting " "Reconnect", ctrl->cnum); - switch (nvme_ctrl_state(&ctrl->ctrl)) { + spin_lock_irqsave(&ctrl->lock, flags); + set_bit(ASSOC_FAILED, &ctrl->flags); + state = nvme_ctrl_state(&ctrl->ctrl); + spin_unlock_irqrestore(&ctrl->lock, flags); + + switch (state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: /* @@ -2079,7 +2087,8 @@ done: nvme_fc_complete_rq(rq); check_error: - if (terminate_assoc && ctrl->ctrl.state != NVME_CTRL_RESETTING) + if (terminate_assoc && + nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING) queue_work(nvme_reset_wq, &ctrl->ioerr_work); } @@ -2533,6 +2542,8 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) { + enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); + /* * if an error (io timeout, etc) while (re)connecting, the remote * port requested terminating of the association (disconnect_ls) @@ -2540,9 +2551,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) * the controller. Abort any ios on the association and let the * create_association error path resolve things. */ - if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { + if (state == NVME_CTRL_CONNECTING) { __nvme_fc_abort_outstanding_ios(ctrl, true); - set_bit(ASSOC_FAILED, &ctrl->flags); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport error during (re)connect\n", ctrl->cnum); @@ -2550,7 +2560,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) } /* Otherwise, only proceed if in LIVE state - e.g. on first error */ - if (ctrl->ctrl.state != NVME_CTRL_LIVE) + if (state != NVME_CTRL_LIVE) return; dev_warn(ctrl->ctrl.device, @@ -3167,12 +3177,18 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) else ret = nvme_fc_recreate_io_queues(ctrl); } - if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) - ret = -EIO; if (ret) goto out_term_aen_ops; - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + spin_lock_irqsave(&ctrl->lock, flags); + if (!test_bit(ASSOC_FAILED, &ctrl->flags)) + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + else + ret = -EIO; + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (ret) + goto out_term_aen_ops; ctrl->ctrl.nr_reconnects = 0; @@ -3578,8 +3594,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); spin_unlock_irqrestore(&rport->lock, flags); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) || - !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { dev_err(ctrl->ctrl.device, "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum); goto fail_ctrl; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 278bed4e35bb..9197a5b173fd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2153,14 +2153,6 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred, return 0; out_free_bufs: - while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; - - dma_free_attrs(dev->dev, size, bufs[i], - le64_to_cpu(descs[i].addr), - DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); - } - kfree(bufs); out_free_descs: dma_free_coherent(dev->dev, descs_size, descs, descs_dma); @@ -3147,7 +3139,9 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) * because of high power consumption (> 2 Watt) in s2idle * sleep. Only some boards with Intel CPU are affected. */ - if (dmi_match(DMI_BOARD_NAME, "GMxPXxx") || + if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") || + dmi_match(DMI_BOARD_NAME, "GMxPXxx") || + dmi_match(DMI_BOARD_NAME, "GXxMRXx") || dmi_match(DMI_BOARD_NAME, "PH4PG31") || dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") || dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index b68a9e5f1ea3..3a41b9ab0f13 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -792,7 +792,7 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, return a->mode; } -const struct attribute_group nvme_tls_attrs_group = { +static const struct attribute_group nvme_tls_attrs_group = { .attrs = nvme_tls_attrs, .is_visible = nvme_tls_attrs_are_visible, }; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e670dc185a96..acc138bbf8f2 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1068,6 +1068,7 @@ static void nvme_execute_identify_ns_nvm(struct nvmet_req *req) goto out; } status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + kfree(id); out: nvmet_req_complete(req, status); } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index a7ff05b3be29..eb406c90c167 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -287,7 +287,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) args.subsysnqn = d->subsysnqn; args.hostnqn = d->hostnqn; args.hostid = &d->hostid; - args.kato = c->kato; + args.kato = le32_to_cpu(c->kato); ctrl = nvmet_alloc_ctrl(&args); if (!ctrl) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index c1f574fe3280..83be0657e6df 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -272,7 +272,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) iter_flags = SG_MITER_FROM_SG; } - if (req->cmd->rw.control & NVME_RW_LR) + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_LR)) opf |= REQ_FAILFAST_DEV; if (is_pci_p2pdma_page(sg_page(req->sg))) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index b540216c0c9a..4be8d22d2d8d 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -589,7 +589,7 @@ struct nvmet_alloc_ctrl_args { const struct nvmet_fabrics_ops *ops; struct device *p2p_client; u32 kato; - u32 result; + __le32 result; u16 error_loc; u16 status; }; diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e0bc90597dca..da3e7edcf49d 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -108,9 +108,6 @@ void pci_save_aspm_l1ss_state(struct pci_dev *pdev) pci_read_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL2, cap++); pci_read_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL1, cap++); - if (parent->state_saved) - return; - /* * Save parent's L1 substate configuration so we have it for * pci_restore_aspm_l1ss_state(pdev) to restore. diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index 1e604fbbda65..07de59ca2ebf 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -360,7 +360,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) return err; } - set_ctrl_reg_req_en(pdev, pdev->tph_mode); + set_ctrl_reg_req_en(pdev, pdev->tph_req_type); pci_dbg(pdev, "set steering tag: %s table, index=%d, tag=%#04x\n", (loc == PCI_TPH_LOC_MSIX) ? "MSI-X" : "ST", index, tag); diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 0b13d7f17b32..42547f64453e 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -89,12 +89,12 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, seq_puts(s, items[i].display); /* Print unit if available */ if (items[i].has_arg) { - seq_printf(s, " (0x%x", - pinconf_to_config_argument(config)); + u32 val = pinconf_to_config_argument(config); + if (items[i].format) - seq_printf(s, " %s)", items[i].format); + seq_printf(s, " (%u %s)", val, items[i].format); else - seq_puts(s, ")"); + seq_printf(s, " (0x%x)", val); } } } diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 0d6c2027d4c1..d73004b4a45e 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -42,7 +42,7 @@ #define CY8C95X0_PORTSEL 0x18 /* Port settings, write PORTSEL first */ #define CY8C95X0_INTMASK 0x19 -#define CY8C95X0_PWMSEL 0x1A +#define CY8C95X0_SELPWM 0x1A #define CY8C95X0_INVERT 0x1B #define CY8C95X0_DIRECTION 0x1C /* Drive mode register change state on writing '1' */ @@ -328,14 +328,14 @@ static int cypress_get_pin_mask(struct cy8c95x0_pinctrl *chip, unsigned int pin) static bool cy8c95x0_readable_register(struct device *dev, unsigned int reg) { /* - * Only 12 registers are present per port (see Table 6 in the - * datasheet). + * Only 12 registers are present per port (see Table 6 in the datasheet). */ - if (reg >= CY8C95X0_VIRTUAL && (reg % MUXED_STRIDE) < 12) - return true; + if (reg >= CY8C95X0_VIRTUAL && (reg % MUXED_STRIDE) >= 12) + return false; switch (reg) { case 0x24 ... 0x27: + case 0x31 ... 0x3f: return false; default: return true; @@ -344,8 +344,11 @@ static bool cy8c95x0_readable_register(struct device *dev, unsigned int reg) static bool cy8c95x0_writeable_register(struct device *dev, unsigned int reg) { - if (reg >= CY8C95X0_VIRTUAL) - return true; + /* + * Only 12 registers are present per port (see Table 6 in the datasheet). + */ + if (reg >= CY8C95X0_VIRTUAL && (reg % MUXED_STRIDE) >= 12) + return false; switch (reg) { case CY8C95X0_INPUT_(0) ... CY8C95X0_INPUT_(7): @@ -353,6 +356,7 @@ static bool cy8c95x0_writeable_register(struct device *dev, unsigned int reg) case CY8C95X0_DEVID: return false; case 0x24 ... 0x27: + case 0x31 ... 0x3f: return false; default: return true; @@ -365,8 +369,8 @@ static bool cy8c95x0_volatile_register(struct device *dev, unsigned int reg) case CY8C95X0_INPUT_(0) ... CY8C95X0_INPUT_(7): case CY8C95X0_INTSTATUS_(0) ... CY8C95X0_INTSTATUS_(7): case CY8C95X0_INTMASK: + case CY8C95X0_SELPWM: case CY8C95X0_INVERT: - case CY8C95X0_PWMSEL: case CY8C95X0_DIRECTION: case CY8C95X0_DRV_PU: case CY8C95X0_DRV_PD: @@ -395,7 +399,7 @@ static bool cy8c95x0_muxed_register(unsigned int reg) { switch (reg) { case CY8C95X0_INTMASK: - case CY8C95X0_PWMSEL: + case CY8C95X0_SELPWM: case CY8C95X0_INVERT: case CY8C95X0_DIRECTION: case CY8C95X0_DRV_PU: @@ -466,7 +470,11 @@ static const struct regmap_config cy8c9520_i2c_regmap = { .max_register = 0, /* Updated at runtime */ .num_reg_defaults_raw = 0, /* Updated at runtime */ .use_single_read = true, /* Workaround for regcache bug */ +#if IS_ENABLED(CONFIG_DEBUG_PINCTRL) + .disable_locking = false, +#else .disable_locking = true, +#endif }; static inline int cy8c95x0_regmap_update_bits_base(struct cy8c95x0_pinctrl *chip, @@ -789,7 +797,7 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, reg = CY8C95X0_DIRECTION; break; case PIN_CONFIG_MODE_PWM: - reg = CY8C95X0_PWMSEL; + reg = CY8C95X0_SELPWM; break; case PIN_CONFIG_OUTPUT: reg = CY8C95X0_OUTPUT; @@ -868,7 +876,7 @@ static int cy8c95x0_gpio_set_pincfg(struct cy8c95x0_pinctrl *chip, reg = CY8C95X0_DRV_PP_FAST; break; case PIN_CONFIG_MODE_PWM: - reg = CY8C95X0_PWMSEL; + reg = CY8C95X0_SELPWM; break; case PIN_CONFIG_OUTPUT_ENABLE: return cy8c95x0_pinmux_direction(chip, off, !arg); @@ -1153,7 +1161,7 @@ static void cy8c95x0_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file * bitmap_zero(mask, MAX_LINE); __set_bit(pin, mask); - if (cy8c95x0_read_regs_mask(chip, CY8C95X0_PWMSEL, pwm, mask)) { + if (cy8c95x0_read_regs_mask(chip, CY8C95X0_SELPWM, pwm, mask)) { seq_puts(s, "not available"); return; } @@ -1198,7 +1206,7 @@ static int cy8c95x0_set_mode(struct cy8c95x0_pinctrl *chip, unsigned int off, bo u8 port = cypress_get_port(chip, off); u8 bit = cypress_get_pin_mask(chip, off); - return cy8c95x0_regmap_write_bits(chip, CY8C95X0_PWMSEL, port, bit, mode ? bit : 0); + return cy8c95x0_regmap_write_bits(chip, CY8C95X0_SELPWM, port, bit, mode ? bit : 0); } static int cy8c95x0_pinmux_mode(struct cy8c95x0_pinctrl *chip, @@ -1347,7 +1355,7 @@ static int cy8c95x0_irq_setup(struct cy8c95x0_pinctrl *chip, int irq) ret = devm_request_threaded_irq(chip->dev, irq, NULL, cy8c95x0_irq_handler, - IRQF_ONESHOT | IRQF_SHARED | IRQF_TRIGGER_HIGH, + IRQF_ONESHOT | IRQF_SHARED, dev_name(chip->dev), chip); if (ret) { dev_err(chip->dev, "failed to request irq %d\n", irq); @@ -1438,15 +1446,15 @@ static int cy8c95x0_probe(struct i2c_client *client) switch (chip->tpin) { case 20: strscpy(chip->name, cy8c95x0_id[0].name); - regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 3 * MUXED_STRIDE; + regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 3 * MUXED_STRIDE - 1; break; case 40: strscpy(chip->name, cy8c95x0_id[1].name); - regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 6 * MUXED_STRIDE; + regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 6 * MUXED_STRIDE - 1; break; case 60: strscpy(chip->name, cy8c95x0_id[2].name); - regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 8 * MUXED_STRIDE; + regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 8 * MUXED_STRIDE - 1; break; default: return -ENODEV; diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index dfb5d4b8c046..30bd366d7b58 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -1121,7 +1121,7 @@ static int ideapad_dytc_profile_init(struct ideapad_private *priv) /* Create platform_profile structure and register */ priv->dytc->ppdev = devm_platform_profile_register(&priv->platform_device->dev, - "ideapad-laptop", &priv->dytc, + "ideapad-laptop", priv->dytc, &dytc_profile_ops); if (IS_ERR(priv->dytc->ppdev)) { err = PTR_ERR(priv->dytc->ppdev); diff --git a/drivers/platform/x86/intel/ifs/ifs.h b/drivers/platform/x86/intel/ifs/ifs.h index 5c3c0dfa1bf8..f369fb0d3d82 100644 --- a/drivers/platform/x86/intel/ifs/ifs.h +++ b/drivers/platform/x86/intel/ifs/ifs.h @@ -23,12 +23,14 @@ * IFS Image * --------- * - * Intel provides a firmware file containing the scan tests via - * github [#f1]_. Similar to microcode there is a separate file for each + * Intel provides firmware files containing the scan tests via the webpage [#f1]_. + * Look under "In-Field Scan Test Images Download" section towards the + * end of the page. Similar to microcode, there are separate files for each * family-model-stepping. IFS Images are not applicable for some test types. * Wherever applicable the sysfs directory would provide a "current_batch" file * (see below) for loading the image. * + * .. [#f1] https://intel.com/InFieldScan * * IFS Image Loading * ----------------- @@ -125,9 +127,6 @@ * 2) Hardware allows for some number of cores to be tested in parallel. * The driver does not make use of this, it only tests one core at a time. * - * .. [#f1] https://github.com/intel/TBD - * - * * Structural Based Functional Test at Field (SBAF): * ------------------------------------------------- * diff --git a/drivers/platform/x86/intel/int3472/discrete.c b/drivers/platform/x86/intel/int3472/discrete.c index 31015ebe20d8..092252eb95a8 100644 --- a/drivers/platform/x86/intel/int3472/discrete.c +++ b/drivers/platform/x86/intel/int3472/discrete.c @@ -2,6 +2,7 @@ /* Author: Dan Scally <djrscally@gmail.com> */ #include <linux/acpi.h> +#include <linux/array_size.h> #include <linux/bitfield.h> #include <linux/device.h> #include <linux/gpio/consumer.h> @@ -55,7 +56,7 @@ static void skl_int3472_log_sensor_module_name(struct int3472_discrete_device *i static int skl_int3472_fill_gpiod_lookup(struct gpiod_lookup *table_entry, struct acpi_resource_gpio *agpio, - const char *func, u32 polarity) + const char *func, unsigned long gpio_flags) { char *path = agpio->resource_source.string_ptr; struct acpi_device *adev; @@ -70,14 +71,14 @@ static int skl_int3472_fill_gpiod_lookup(struct gpiod_lookup *table_entry, if (!adev) return -ENODEV; - *table_entry = GPIO_LOOKUP(acpi_dev_name(adev), agpio->pin_table[0], func, polarity); + *table_entry = GPIO_LOOKUP(acpi_dev_name(adev), agpio->pin_table[0], func, gpio_flags); return 0; } static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int3472, struct acpi_resource_gpio *agpio, - const char *func, u32 polarity) + const char *func, unsigned long gpio_flags) { int ret; @@ -87,7 +88,7 @@ static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int347 } ret = skl_int3472_fill_gpiod_lookup(&int3472->gpios.table[int3472->n_sensor_gpios], - agpio, func, polarity); + agpio, func, gpio_flags); if (ret) return ret; @@ -100,7 +101,7 @@ static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int347 static struct gpio_desc * skl_int3472_gpiod_get_from_temp_lookup(struct int3472_discrete_device *int3472, struct acpi_resource_gpio *agpio, - const char *func, u32 polarity) + const char *func, unsigned long gpio_flags) { struct gpio_desc *desc; int ret; @@ -111,7 +112,7 @@ skl_int3472_gpiod_get_from_temp_lookup(struct int3472_discrete_device *int3472, return ERR_PTR(-ENOMEM); lookup->dev_id = dev_name(int3472->dev); - ret = skl_int3472_fill_gpiod_lookup(&lookup->table[0], agpio, func, polarity); + ret = skl_int3472_fill_gpiod_lookup(&lookup->table[0], agpio, func, gpio_flags); if (ret) return ERR_PTR(ret); @@ -122,32 +123,76 @@ skl_int3472_gpiod_get_from_temp_lookup(struct int3472_discrete_device *int3472, return desc; } -static void int3472_get_func_and_polarity(u8 type, const char **func, u32 *polarity) +/** + * struct int3472_gpio_map - Map GPIOs to whatever is expected by the + * sensor driver (as in DT bindings) + * @hid: The ACPI HID of the device without the instance number e.g. INT347E + * @type_from: The GPIO type from ACPI ?SDT + * @type_to: The assigned GPIO type, typically same as @type_from + * @func: The function, e.g. "enable" + * @polarity_low: GPIO_ACTIVE_LOW true if the @polarity_low is true, + * GPIO_ACTIVE_HIGH otherwise + */ +struct int3472_gpio_map { + const char *hid; + u8 type_from; + u8 type_to; + bool polarity_low; + const char *func; +}; + +static const struct int3472_gpio_map int3472_gpio_map[] = { + { "INT347E", INT3472_GPIO_TYPE_RESET, INT3472_GPIO_TYPE_RESET, false, "enable" }, +}; + +static void int3472_get_func_and_polarity(struct acpi_device *adev, u8 *type, + const char **func, unsigned long *gpio_flags) { - switch (type) { + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(int3472_gpio_map); i++) { + /* + * Map the firmware-provided GPIO to whatever a driver expects + * (as in DT bindings). First check if the type matches with the + * GPIO map, then further check that the device _HID matches. + */ + if (*type != int3472_gpio_map[i].type_from) + continue; + + if (!acpi_dev_hid_uid_match(adev, int3472_gpio_map[i].hid, NULL)) + continue; + + *type = int3472_gpio_map[i].type_to; + *gpio_flags = int3472_gpio_map[i].polarity_low ? + GPIO_ACTIVE_LOW : GPIO_ACTIVE_HIGH; + *func = int3472_gpio_map[i].func; + return; + } + + switch (*type) { case INT3472_GPIO_TYPE_RESET: *func = "reset"; - *polarity = GPIO_ACTIVE_LOW; + *gpio_flags = GPIO_ACTIVE_LOW; break; case INT3472_GPIO_TYPE_POWERDOWN: *func = "powerdown"; - *polarity = GPIO_ACTIVE_LOW; + *gpio_flags = GPIO_ACTIVE_LOW; break; case INT3472_GPIO_TYPE_CLK_ENABLE: *func = "clk-enable"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; case INT3472_GPIO_TYPE_PRIVACY_LED: *func = "privacy-led"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; case INT3472_GPIO_TYPE_POWER_ENABLE: *func = "power-enable"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; default: *func = "unknown"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; } } @@ -194,7 +239,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, struct gpio_desc *gpio; const char *err_msg; const char *func; - u32 polarity; + unsigned long gpio_flags; int ret; if (!acpi_gpio_get_io_resource(ares, &agpio)) @@ -217,7 +262,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, type = FIELD_GET(INT3472_GPIO_DSM_TYPE, obj->integer.value); - int3472_get_func_and_polarity(type, &func, &polarity); + int3472_get_func_and_polarity(int3472->sensor, &type, &func, &gpio_flags); pin = FIELD_GET(INT3472_GPIO_DSM_PIN, obj->integer.value); /* Pin field is not really used under Windows and wraps around at 8 bits */ @@ -227,16 +272,16 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, active_value = FIELD_GET(INT3472_GPIO_DSM_SENSOR_ON_VAL, obj->integer.value); if (!active_value) - polarity ^= GPIO_ACTIVE_LOW; + gpio_flags ^= GPIO_ACTIVE_LOW; dev_dbg(int3472->dev, "%s %s pin %d active-%s\n", func, agpio->resource_source.string_ptr, agpio->pin_table[0], - str_high_low(polarity == GPIO_ACTIVE_HIGH)); + str_high_low(gpio_flags == GPIO_ACTIVE_HIGH)); switch (type) { case INT3472_GPIO_TYPE_RESET: case INT3472_GPIO_TYPE_POWERDOWN: - ret = skl_int3472_map_gpio_to_sensor(int3472, agpio, func, polarity); + ret = skl_int3472_map_gpio_to_sensor(int3472, agpio, func, gpio_flags); if (ret) err_msg = "Failed to map GPIO pin to sensor\n"; @@ -244,7 +289,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, case INT3472_GPIO_TYPE_CLK_ENABLE: case INT3472_GPIO_TYPE_PRIVACY_LED: case INT3472_GPIO_TYPE_POWER_ENABLE: - gpio = skl_int3472_gpiod_get_from_temp_lookup(int3472, agpio, func, polarity); + gpio = skl_int3472_gpiod_get_from_temp_lookup(int3472, agpio, func, gpio_flags); if (IS_ERR(gpio)) { ret = PTR_ERR(gpio); err_msg = "Failed to get GPIO\n"; diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 10f04b944117..1ee0fb5f8250 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -626,8 +626,8 @@ static u32 convert_ltr_scale(u32 val) static int pmc_core_ltr_show(struct seq_file *s, void *unused) { struct pmc_dev *pmcdev = s->private; - u64 decoded_snoop_ltr, decoded_non_snoop_ltr; - u32 ltr_raw_data, scale, val; + u64 decoded_snoop_ltr, decoded_non_snoop_ltr, val; + u32 ltr_raw_data, scale; u16 snoop_ltr, nonsnoop_ltr; unsigned int i, index, ltr_index = 0; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 1fcb0f99695a..72a10ed2017c 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -7885,6 +7885,7 @@ static struct ibm_struct volume_driver_data = { #define FAN_NS_CTRL_STATUS BIT(2) /* Bit which determines control is enabled or not */ #define FAN_NS_CTRL BIT(4) /* Bit which determines control is by host or EC */ +#define FAN_CLOCK_TPM (22500*60) /* Ticks per minute for a 22.5 kHz clock */ enum { /* Fan control constants */ fan_status_offset = 0x2f, /* EC register 0x2f */ @@ -7940,6 +7941,7 @@ static int fan_watchdog_maxinterval; static bool fan_with_ns_addr; static bool ecfw_with_fan_dec_rpm; +static bool fan_speed_in_tpr; static struct mutex fan_mutex; @@ -8142,8 +8144,11 @@ static int fan_get_speed(unsigned int *speed) !acpi_ec_read(fan_rpm_offset + 1, &hi))) return -EIO; - if (likely(speed)) + if (likely(speed)) { *speed = (hi << 8) | lo; + if (fan_speed_in_tpr && *speed != 0) + *speed = FAN_CLOCK_TPM / *speed; + } break; case TPACPI_FAN_RD_TPEC_NS: if (!acpi_ec_read(fan_rpm_status_ns, &lo)) @@ -8176,8 +8181,11 @@ static int fan2_get_speed(unsigned int *speed) if (rc) return -EIO; - if (likely(speed)) + if (likely(speed)) { *speed = (hi << 8) | lo; + if (fan_speed_in_tpr && *speed != 0) + *speed = FAN_CLOCK_TPM / *speed; + } break; case TPACPI_FAN_RD_TPEC_NS: @@ -8788,6 +8796,7 @@ static const struct attribute_group fan_driver_attr_group = { #define TPACPI_FAN_NOFAN 0x0008 /* no fan available */ #define TPACPI_FAN_NS 0x0010 /* For EC with non-Standard register addresses */ #define TPACPI_FAN_DECRPM 0x0020 /* For ECFW's with RPM in register as decimal */ +#define TPACPI_FAN_TPR 0x0040 /* Fan speed is in Ticks Per Revolution */ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_QEC_IBM('1', 'Y', TPACPI_FAN_Q1), @@ -8817,6 +8826,7 @@ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_Q_LNV3('R', '0', 'V', TPACPI_FAN_NS), /* 11e Gen5 KL-Y */ TPACPI_Q_LNV3('N', '1', 'O', TPACPI_FAN_NOFAN), /* X1 Tablet (2nd gen) */ TPACPI_Q_LNV3('R', '0', 'Q', TPACPI_FAN_DECRPM),/* L480 */ + TPACPI_Q_LNV('8', 'F', TPACPI_FAN_TPR), /* ThinkPad x120e */ }; static int __init fan_init(struct ibm_init_struct *iibm) @@ -8887,6 +8897,8 @@ static int __init fan_init(struct ibm_init_struct *iibm) if (quirks & TPACPI_FAN_Q1) fan_quirk1_setup(); + if (quirks & TPACPI_FAN_TPR) + fan_speed_in_tpr = true; /* Try and probe the 2nd fan */ tp_features.second_fan = 1; /* needed for get_speed to work */ res = fan2_get_speed(&speed); @@ -10319,6 +10331,10 @@ static struct ibm_struct proxsensor_driver_data = { #define DYTC_MODE_PSC_BALANCE 5 /* Default mode aka balanced */ #define DYTC_MODE_PSC_PERFORM 7 /* High power mode aka performance */ +#define DYTC_MODE_PSCV9_LOWPOWER 1 /* Low power mode */ +#define DYTC_MODE_PSCV9_BALANCE 3 /* Default mode aka balanced */ +#define DYTC_MODE_PSCV9_PERFORM 4 /* High power mode aka performance */ + #define DYTC_ERR_MASK 0xF /* Bits 0-3 in cmd result are the error result */ #define DYTC_ERR_SUCCESS 1 /* CMD completed successful */ @@ -10339,6 +10355,10 @@ static int dytc_capabilities; static bool dytc_mmc_get_available; static int profile_force; +static int platform_psc_profile_lowpower = DYTC_MODE_PSC_LOWPOWER; +static int platform_psc_profile_balanced = DYTC_MODE_PSC_BALANCE; +static int platform_psc_profile_performance = DYTC_MODE_PSC_PERFORM; + static int convert_dytc_to_profile(int funcmode, int dytcmode, enum platform_profile_option *profile) { @@ -10360,19 +10380,15 @@ static int convert_dytc_to_profile(int funcmode, int dytcmode, } return 0; case DYTC_FUNCTION_PSC: - switch (dytcmode) { - case DYTC_MODE_PSC_LOWPOWER: + if (dytcmode == platform_psc_profile_lowpower) *profile = PLATFORM_PROFILE_LOW_POWER; - break; - case DYTC_MODE_PSC_BALANCE: + else if (dytcmode == platform_psc_profile_balanced) *profile = PLATFORM_PROFILE_BALANCED; - break; - case DYTC_MODE_PSC_PERFORM: + else if (dytcmode == platform_psc_profile_performance) *profile = PLATFORM_PROFILE_PERFORMANCE; - break; - default: /* Unknown mode */ + else return -EINVAL; - } + return 0; case DYTC_FUNCTION_AMT: /* For now return balanced. It's the closest we have to 'auto' */ @@ -10393,19 +10409,19 @@ static int convert_profile_to_dytc(enum platform_profile_option profile, int *pe if (dytc_capabilities & BIT(DYTC_FC_MMC)) *perfmode = DYTC_MODE_MMC_LOWPOWER; else if (dytc_capabilities & BIT(DYTC_FC_PSC)) - *perfmode = DYTC_MODE_PSC_LOWPOWER; + *perfmode = platform_psc_profile_lowpower; break; case PLATFORM_PROFILE_BALANCED: if (dytc_capabilities & BIT(DYTC_FC_MMC)) *perfmode = DYTC_MODE_MMC_BALANCE; else if (dytc_capabilities & BIT(DYTC_FC_PSC)) - *perfmode = DYTC_MODE_PSC_BALANCE; + *perfmode = platform_psc_profile_balanced; break; case PLATFORM_PROFILE_PERFORMANCE: if (dytc_capabilities & BIT(DYTC_FC_MMC)) *perfmode = DYTC_MODE_MMC_PERFORM; else if (dytc_capabilities & BIT(DYTC_FC_PSC)) - *perfmode = DYTC_MODE_PSC_PERFORM; + *perfmode = platform_psc_profile_performance; break; default: /* Unknown profile */ return -EOPNOTSUPP; @@ -10599,6 +10615,7 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) if (output & BIT(DYTC_QUERY_ENABLE_BIT)) dytc_version = (output >> DYTC_QUERY_REV_BIT) & 0xF; + dbg_printk(TPACPI_DBG_INIT, "DYTC version %d\n", dytc_version); /* Check DYTC is enabled and supports mode setting */ if (dytc_version < 5) return -ENODEV; @@ -10637,6 +10654,11 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) } } else if (dytc_capabilities & BIT(DYTC_FC_PSC)) { /* PSC MODE */ pr_debug("PSC is supported\n"); + if (dytc_version >= 9) { /* update profiles for DYTC 9 and up */ + platform_psc_profile_lowpower = DYTC_MODE_PSCV9_LOWPOWER; + platform_psc_profile_balanced = DYTC_MODE_PSCV9_BALANCE; + platform_psc_profile_performance = DYTC_MODE_PSCV9_PERFORM; + } } else { dbg_printk(TPACPI_DBG_INIT, "No DYTC support available\n"); return -ENODEV; @@ -10646,8 +10668,8 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) "DYTC version %d: thermal mode available\n", dytc_version); /* Create platform_profile structure and register */ - tpacpi_pprof = devm_platform_profile_register(&tpacpi_pdev->dev, "thinkpad-acpi", - NULL, &dytc_profile_ops); + tpacpi_pprof = platform_profile_register(&tpacpi_pdev->dev, "thinkpad-acpi-profile", + NULL, &dytc_profile_ops); /* * If for some reason platform_profiles aren't enabled * don't quit terminally. @@ -10665,8 +10687,15 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) return 0; } +static void dytc_profile_exit(void) +{ + if (!IS_ERR_OR_NULL(tpacpi_pprof)) + platform_profile_remove(tpacpi_pprof); +} + static struct ibm_struct dytc_profile_driver_data = { .name = "dytc-profile", + .exit = dytc_profile_exit, }; /************************************************************************* diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index 52c32dcbf7d8..4112a0097338 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -627,8 +627,7 @@ struct powercap_control_type *powercap_register_control_type( dev_set_name(&control_type->dev, "%s", name); result = device_register(&control_type->dev); if (result) { - if (control_type->allocated) - kfree(control_type); + put_device(&control_type->dev); return ERR_PTR(result); } idr_init(&control_type->idr); diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 0a2cfc8ad3c5..b3a83b03d9c1 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -414,16 +414,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, } static const struct file_operations vmclock_miscdev_fops = { + .owner = THIS_MODULE, .mmap = vmclock_miscdev_mmap, .read = vmclock_miscdev_read, }; /* module operations */ -static void vmclock_remove(struct platform_device *pdev) +static void vmclock_remove(void *data) { - struct device *dev = &pdev->dev; - struct vmclock_state *st = dev_get_drvdata(dev); + struct vmclock_state *st = data; if (st->ptp_clock) ptp_clock_unregister(st->ptp_clock); @@ -506,14 +506,13 @@ static int vmclock_probe(struct platform_device *pdev) if (ret) { dev_info(dev, "Failed to obtain physical address: %d\n", ret); - goto out; + return ret; } if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) { dev_info(dev, "Region too small (0x%llx)\n", resource_size(&st->res)); - ret = -EINVAL; - goto out; + return -EINVAL; } st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res), MEMREMAP_WB | MEMREMAP_DEC); @@ -521,31 +520,34 @@ static int vmclock_probe(struct platform_device *pdev) ret = PTR_ERR(st->clk); dev_info(dev, "failed to map shared memory\n"); st->clk = NULL; - goto out; + return ret; } if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || le32_to_cpu(st->clk->size) > resource_size(&st->res) || le16_to_cpu(st->clk->version) != 1) { dev_info(dev, "vmclock magic fields invalid\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } ret = ida_alloc(&vmclock_ida, GFP_KERNEL); if (ret < 0) - goto out; + return ret; st->index = ret; ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); if (ret) - goto out; + return ret; st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index); - if (!st->name) { - ret = -ENOMEM; - goto out; - } + if (!st->name) + return -ENOMEM; + + st->miscdev.minor = MISC_DYNAMIC_MINOR; + + ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st); + if (ret) + return ret; /* * If the structure is big enough, it can be mapped to userspace. @@ -554,13 +556,12 @@ static int vmclock_probe(struct platform_device *pdev) * cross that bridge if/when we come to it. */ if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) { - st->miscdev.minor = MISC_DYNAMIC_MINOR; st->miscdev.fops = &vmclock_miscdev_fops; st->miscdev.name = st->name; ret = misc_register(&st->miscdev); if (ret) - goto out; + return ret; } /* If there is valid clock information, register a PTP clock */ @@ -570,16 +571,14 @@ static int vmclock_probe(struct platform_device *pdev) if (IS_ERR(st->ptp_clock)) { ret = PTR_ERR(st->ptp_clock); st->ptp_clock = NULL; - vmclock_remove(pdev); - goto out; + return ret; } } if (!st->miscdev.minor && !st->ptp_clock) { /* Neither miscdev nor PTP registered */ dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n"); - ret = -ENODEV; - goto out; + return -ENODEV; } dev_info(dev, "%s: registered %s%s%s\n", st->name, @@ -587,10 +586,7 @@ static int vmclock_probe(struct platform_device *pdev) (st->miscdev.minor && st->ptp_clock) ? ", " : "", st->ptp_clock ? "PTP" : ""); - dev_set_drvdata(dev, st); - - out: - return ret; + return 0; } static const struct acpi_device_id vmclock_acpi_ids[] = { @@ -601,7 +597,6 @@ MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); static struct platform_driver vmclock_platform_driver = { .probe = vmclock_probe, - .remove = vmclock_remove, .driver = { .name = "vmclock", .acpi_match_table = vmclock_acpi_ids, diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index a3adaec5504e..20328d695ef9 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -7050,14 +7050,16 @@ int qeth_open(struct net_device *dev) card->data.state = CH_STATE_UP; netif_tx_start_all_queues(dev); - local_bh_disable(); qeth_for_each_output_queue(card, queue, i) { netif_napi_add_tx(dev, &queue->napi, qeth_tx_poll); napi_enable(&queue->napi); - napi_schedule(&queue->napi); } - napi_enable(&card->napi); + + local_bh_disable(); + qeth_for_each_output_queue(card, queue, i) { + napi_schedule(&queue->napi); + } napi_schedule(&card->napi); /* kick-start the NAPI softirq: */ local_bh_enable(); diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 1fd2da0264e3..47d74f881948 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -2867,7 +2867,7 @@ qla1280_64bit_start_scsi(struct scsi_qla_host *ha, struct srb * sp) dprintk(3, "S/G Segment phys_addr=%x %x, len=0x%x\n", cpu_to_le32(upper_32_bits(dma_handle)), cpu_to_le32(lower_32_bits(dma_handle)), - cpu_to_le32(sg_dma_len(sg_next(s)))); + cpu_to_le32(sg_dma_len(s))); remseg--; } dprintk(5, "qla1280_64bit_start_scsi: Scatter/gather " diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d776f13cd160..be0890e4e706 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -872,13 +872,18 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) case 0x1a: /* start stop unit in progress */ case 0x1b: /* sanitize in progress */ case 0x1d: /* configuration in progress */ - case 0x24: /* depopulation in progress */ - case 0x25: /* depopulation restore in progress */ action = ACTION_DELAYED_RETRY; break; case 0x0a: /* ALUA state transition */ action = ACTION_DELAYED_REPREP; break; + /* + * Depopulation might take many hours, + * thus it is not worthwhile to retry. + */ + case 0x24: /* depopulation in progress */ + case 0x25: /* depopulation restore in progress */ + fallthrough; default: action = ACTION_FAIL; break; diff --git a/drivers/scsi/scsi_lib_test.c b/drivers/scsi/scsi_lib_test.c index 99834426a100..ae8af0e0047a 100644 --- a/drivers/scsi/scsi_lib_test.c +++ b/drivers/scsi/scsi_lib_test.c @@ -67,6 +67,13 @@ static void scsi_lib_test_multiple_sense(struct kunit *test) }; int i; + /* Success */ + sc.result = 0; + KUNIT_EXPECT_EQ(test, 0, scsi_check_passthrough(&sc, &failures)); + KUNIT_EXPECT_EQ(test, 0, scsi_check_passthrough(&sc, NULL)); + /* Command failed but caller did not pass in a failures array */ + scsi_build_sense(&sc, 0, ILLEGAL_REQUEST, 0x91, 0x36); + KUNIT_EXPECT_EQ(test, 0, scsi_check_passthrough(&sc, NULL)); /* Match end of array */ scsi_build_sense(&sc, 0, ILLEGAL_REQUEST, 0x91, 0x36); KUNIT_EXPECT_EQ(test, -EAGAIN, scsi_check_passthrough(&sc, &failures)); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 087fcbfc9aaa..96d7e1a9a7c7 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -246,7 +246,7 @@ static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, } ret = sbitmap_init_node(&sdev->budget_map, scsi_device_max_queue_depth(sdev), - new_shift, GFP_KERNEL, + new_shift, GFP_NOIO, sdev->request_queue->node, false, true); if (!ret) sbitmap_resize(&sdev->budget_map, depth); diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 5a101ac06c47..a8614e54544e 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1800,6 +1800,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) length = scsi_bufflen(scmnd); payload = (struct vmbus_packet_mpb_array *)&cmd_request->mpb; + payload->range.len = 0; payload_sz = 0; if (scsi_sg_count(scmnd)) { diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c index 4783ab1adb8d..a3e88ced328a 100644 --- a/drivers/soc/qcom/smp2p.c +++ b/drivers/soc/qcom/smp2p.c @@ -365,7 +365,7 @@ static void smp2p_irq_print_chip(struct irq_data *irqd, struct seq_file *p) { struct smp2p_entry *entry = irq_data_get_irq_chip_data(irqd); - seq_printf(p, " %8s", dev_name(entry->smp2p->dev)); + seq_printf(p, "%8s", dev_name(entry->smp2p->dev)); } static struct irq_chip smp2p_irq_chip = { diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c index c42cbde8a31b..210648a0092e 100644 --- a/drivers/target/target_core_stat.c +++ b/drivers/target/target_core_stat.c @@ -117,9 +117,9 @@ static ssize_t target_stat_tgt_status_show(struct config_item *item, char *page) { if (to_stat_tgt_dev(item)->export_count) - return snprintf(page, PAGE_SIZE, "activated"); + return snprintf(page, PAGE_SIZE, "activated\n"); else - return snprintf(page, PAGE_SIZE, "deactivated"); + return snprintf(page, PAGE_SIZE, "deactivated\n"); } static ssize_t target_stat_tgt_non_access_lus_show(struct config_item *item, diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index df08f13052ff..8bb1a01fef2a 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -798,7 +798,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); /* We refuse fsnotify events on ptmx, since it's a shared resource */ - filp->f_mode |= FMODE_NONOTIFY; + file_set_fsnotify_mode(filp, FMODE_NONOTIFY); retval = tty_alloc_file(filp); if (retval) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index cd404ade48dc..1893a7ad9531 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -2120,8 +2120,6 @@ static void ufshcd_init_clk_gating(struct ufs_hba *hba) INIT_DELAYED_WORK(&hba->clk_gating.gate_work, ufshcd_gate_work); INIT_WORK(&hba->clk_gating.ungate_work, ufshcd_ungate_work); - spin_lock_init(&hba->clk_gating.lock); - hba->clk_gating.clk_gating_workq = alloc_ordered_workqueue( "ufs_clk_gating_%d", WQ_MEM_RECLAIM | WQ_HIGHPRI, hba->host->host_no); @@ -3106,8 +3104,13 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) case UPIU_TRANSACTION_QUERY_RSP: { u8 response = lrbp->ucd_rsp_ptr->header.response; - if (response == 0) + if (response == 0) { err = ufshcd_copy_query_response(hba, lrbp); + } else { + err = -EINVAL; + dev_err(hba->dev, "%s: unexpected response in Query RSP: %x\n", + __func__, response); + } break; } case UPIU_TRANSACTION_REJECT_UPIU: @@ -5976,24 +5979,6 @@ out: __func__, err); } -static void ufshcd_temp_exception_event_handler(struct ufs_hba *hba, u16 status) -{ - u32 value; - - if (ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_READ_ATTR, - QUERY_ATTR_IDN_CASE_ROUGH_TEMP, 0, 0, &value)) - return; - - dev_info(hba->dev, "exception Tcase %d\n", value - 80); - - ufs_hwmon_notify_event(hba, status & MASK_EE_URGENT_TEMP); - - /* - * A placeholder for the platform vendors to add whatever additional - * steps required - */ -} - static int __ufshcd_wb_toggle(struct ufs_hba *hba, bool set, enum flag_idn idn) { u8 index; @@ -6214,7 +6199,7 @@ static void ufshcd_exception_event_handler(struct work_struct *work) ufshcd_bkops_exception_event_handler(hba); if (status & hba->ee_drv_mask & MASK_EE_URGENT_TEMP) - ufshcd_temp_exception_event_handler(hba, status); + ufs_hwmon_notify_event(hba, status & MASK_EE_URGENT_TEMP); ufs_debugfs_exception_event(hba, status); } @@ -9160,7 +9145,7 @@ out: if (!IS_ERR_OR_NULL(clki->clk) && clki->enabled) clk_disable_unprepare(clki->clk); } - } else if (!ret && on) { + } else if (!ret && on && hba->clk_gating.is_initialized) { scoped_guard(spinlock_irqsave, &hba->clk_gating.lock) hba->clk_gating.state = CLKS_ON; trace_ufshcd_clk_gating(dev_name(hba->dev), @@ -10247,16 +10232,6 @@ EXPORT_SYMBOL_GPL(ufshcd_system_thaw); #endif /* CONFIG_PM_SLEEP */ /** - * ufshcd_dealloc_host - deallocate Host Bus Adapter (HBA) - * @hba: pointer to Host Bus Adapter (HBA) - */ -void ufshcd_dealloc_host(struct ufs_hba *hba) -{ - scsi_host_put(hba->host); -} -EXPORT_SYMBOL_GPL(ufshcd_dealloc_host); - -/** * ufshcd_set_dma_mask - Set dma mask based on the controller * addressing capability * @hba: per adapter instance @@ -10275,11 +10250,25 @@ static int ufshcd_set_dma_mask(struct ufs_hba *hba) } /** + * ufshcd_devres_release - devres cleanup handler, invoked during release of + * hba->dev + * @host: pointer to SCSI host + */ +static void ufshcd_devres_release(void *host) +{ + scsi_host_put(host); +} + +/** * ufshcd_alloc_host - allocate Host Bus Adapter (HBA) * @dev: pointer to device handle * @hba_handle: driver private handle * * Return: 0 on success, non-zero value on failure. + * + * NOTE: There is no corresponding ufshcd_dealloc_host() because this function + * keeps track of its allocations using devres and deallocates everything on + * device removal automatically. */ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) { @@ -10301,6 +10290,13 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) err = -ENOMEM; goto out_error; } + + err = devm_add_action_or_reset(dev, ufshcd_devres_release, + host); + if (err) + return dev_err_probe(dev, err, + "failed to add ufshcd dealloc action\n"); + host->nr_maps = HCTX_TYPE_POLL + 1; hba = shost_priv(host); hba->host = host; @@ -10429,6 +10425,12 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) hba->irq = irq; hba->vps = &ufs_hba_vps; + /* + * Initialize clk_gating.lock early since it is being used in + * ufshcd_setup_clocks() + */ + spin_lock_init(&hba->clk_gating.lock); + err = ufshcd_hba_init(hba); if (err) goto out_error; diff --git a/drivers/ufs/host/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c index ea39c5d5b8cf..9cfcaad23cf9 100644 --- a/drivers/ufs/host/ufshcd-pci.c +++ b/drivers/ufs/host/ufshcd-pci.c @@ -562,7 +562,6 @@ static void ufshcd_pci_remove(struct pci_dev *pdev) pm_runtime_forbid(&pdev->dev); pm_runtime_get_noresume(&pdev->dev); ufshcd_remove(hba); - ufshcd_dealloc_host(hba); } /** @@ -605,7 +604,6 @@ ufshcd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = ufshcd_init(hba, mmio_base, pdev->irq); if (err) { dev_err(&pdev->dev, "Initialization failed\n"); - ufshcd_dealloc_host(hba); return err; } diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c index 505572d4fa87..ffe5d1d2b215 100644 --- a/drivers/ufs/host/ufshcd-pltfrm.c +++ b/drivers/ufs/host/ufshcd-pltfrm.c @@ -465,21 +465,17 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, struct device *dev = &pdev->dev; mmio_base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(mmio_base)) { - err = PTR_ERR(mmio_base); - goto out; - } + if (IS_ERR(mmio_base)) + return PTR_ERR(mmio_base); irq = platform_get_irq(pdev, 0); - if (irq < 0) { - err = irq; - goto out; - } + if (irq < 0) + return irq; err = ufshcd_alloc_host(dev, &hba); if (err) { dev_err(dev, "Allocation failed\n"); - goto out; + return err; } hba->vops = vops; @@ -488,13 +484,13 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, if (err) { dev_err(dev, "%s: clock parse failed %d\n", __func__, err); - goto dealloc_host; + return err; } err = ufshcd_parse_regulator_info(hba); if (err) { dev_err(dev, "%s: regulator init failed %d\n", __func__, err); - goto dealloc_host; + return err; } ufshcd_init_lanes_per_dir(hba); @@ -502,25 +498,20 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, err = ufshcd_parse_operating_points(hba); if (err) { dev_err(dev, "%s: OPP parse failed %d\n", __func__, err); - goto dealloc_host; + return err; } err = ufshcd_init(hba, mmio_base, irq); if (err) { dev_err_probe(dev, err, "Initialization failed with error %d\n", err); - goto dealloc_host; + return err; } pm_runtime_set_active(dev); pm_runtime_enable(dev); return 0; - -dealloc_host: - ufshcd_dealloc_host(hba); -out: - return err; } EXPORT_SYMBOL_GPL(ufshcd_pltfrm_init); @@ -534,7 +525,6 @@ void ufshcd_pltfrm_remove(struct platform_device *pdev) pm_runtime_get_sync(&pdev->dev); ufshcd_remove(hba); - ufshcd_dealloc_host(hba); pm_runtime_disable(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); } diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 85eea7a4dea3..fc7efd0a7525 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -61,6 +61,13 @@ config BCACHEFS_DEBUG The resulting code will be significantly slower than normal; you probably shouldn't select this option unless you're a developer. +config BCACHEFS_INJECT_TRANSACTION_RESTARTS + bool "Randomly inject transaction restarts" + depends on BCACHEFS_DEBUG + help + Randomly inject transaction restarts in a few core paths - may have a + significant performance penalty + config BCACHEFS_TESTS bool "bcachefs unit and performance tests" depends on BCACHEFS_FS diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fc2ef33b67b3..3ea809990ef1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1803,7 +1803,6 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - u64 need_journal_commit_this_dev; }; static int bch2_discard_one_bucket(struct btree_trans *trans, @@ -1827,11 +1826,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - pos.inode, pos.offset)) { - s->need_journal_commit++; - s->need_journal_commit_this_dev++; + u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + pos.inode, pos.offset); + if (seq_ready > c->journal.flushed_seq_ondisk) { + if (seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; goto out; } @@ -1865,23 +1864,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, discard_locked = true; } - if (!bkey_eq(*discard_pos_done, iter.pos) && - ca->mi.discard && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); - *discard_pos_done = iter.pos; + if (!bkey_eq(*discard_pos_done, iter.pos)) { s->discarded++; + *discard_pos_done = iter.pos; - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; + if (ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock_long(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + ret = bch2_trans_relock_notrace(trans); + if (ret) + goto out; + } } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); @@ -1929,6 +1929,9 @@ static void bch2_do_discards_work(struct work_struct *work) POS(ca->dev_idx, U64_MAX), 0, k, bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); + if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) + bch2_journal_flush_async(&c->journal, NULL); + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -2024,7 +2027,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6df41c331a52..5a781fb4c794 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -205,8 +205,12 @@ static inline bool may_alloc_bucket(struct bch_fs *c, return false; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { + u64 journal_seq_ready = + bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + bucket.inode, bucket.offset); + if (journal_seq_ready > c->journal.flushed_seq_ondisk) { + if (journal_seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; s->skipped_need_journal_commit++; return false; } @@ -570,7 +574,7 @@ alloc: ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); - if (s.skipped_need_journal_commit * 2 > avail) + if (s.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 9bbb28e90b93..4aa8ee026cb8 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -18,6 +18,7 @@ struct bucket_alloc_state { u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; + u64 need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; u64 skipped_mi_btree_bitmap; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5988219c6908..e32fce4fd258 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2357,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -2622,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + int ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + while (1) { k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) @@ -2749,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { @@ -3106,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); + struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); @@ -3163,7 +3185,8 @@ out_new_mem: if (old_bytes) { trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: p = trans->mem + trans->mem_top; @@ -3271,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (trans->restarted) { + trans->restart_count_this_trans++; + } else { + trans->restart_count_this_trans = 0; + } +#endif + trans_set_locked(trans, false); if (trans->restarted) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b9538e6e6d65..b96157f3dc9c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err) return btree_trans_restart_ip(trans, err, _THIS_IP_); } +static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { + trace_and_count(trans->c, trans_restart_injected, trans, ip); + return btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_fault_inject, ip); + } +#endif + return 0; +} + bool bch2_btree_node_upgrade(struct btree_trans *, struct btree_path *, unsigned); @@ -739,7 +751,7 @@ transaction_restart: \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret2 ?: trans_was_restarted(_trans, _restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ }) #define for_each_btree_key_max_continue(_trans, _iter, \ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index c378b97ebeca..1821f40c161a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -748,7 +748,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) rcu_read_unlock(); mutex_lock(&bc->table.mutex); mutex_unlock(&bc->table.mutex); - rcu_read_lock(); continue; } for (i = 0; i < tbl->size; i++) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 2760dd9569ed..c4f524b2ca9a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) bch2_trans_verify_not_unlocked_or_in_restart(trans); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) + goto out_reset; + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a6f251eb4164..a09cbe9cd94f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -509,6 +509,9 @@ struct btree_trans { bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + u32 restart_count_this_trans; +#endif u64 last_begin_time; unsigned long last_begin_ip; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7930ffea3075..26d646e1275c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, bset_tree_last(b))); + (void *) btree_bkey_last(b, t)); ssize_t remaining_space = __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { - if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + if (b->written + block_sectors(c) <= btree_sectors(c)) return bne; } else { if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index f9fb150eda70..c8a488e6b7b8 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ memset(t->d, 0, sizeof(t->d[0]) << t->bits); } -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket) +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, + unsigned dev, u64 bucket) { struct buckets_waiting_for_journal_table *t; u64 dev_bucket = (u64) dev << 56 | bucket; - bool ret = false; - unsigned i; + u64 ret = 0; mutex_lock(&b->lock); t = b->t; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq > flushed_seq; + ret = h->journal_seq; break; } } diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h index d2ae19cbe18c..365619ca44c8 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.h +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -4,8 +4,8 @@ #include "buckets_waiting_for_journal_types.h" -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64); +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, + unsigned, u64); int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, u64, unsigned, u64, u64); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 5360cbb3ec29..f4372cafea2e 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { + percpu_down_read(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); bch2_accounting_mem_read_counters(acc, idx, v, nr, false); + percpu_up_read(&c->mark_lock); } static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index d2e134528f0e..428b9be6af34 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -285,12 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +#include "rebalance.h" + static inline struct bch_extent_rebalance bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) { struct bch_io_opts io_opts; bch2_inode_opts_get(&io_opts, c, inode); - return io_opts_to_rebalance_opts(&io_opts); + return io_opts_to_rebalance_opts(c, &io_opts); } int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index dd508d93e9fc..03892388832b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -411,6 +411,16 @@ void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) __bch2_write_op_error(out, op, op->pos.offset); } +static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset) +{ + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index cb2c3722f674..24c294d4634e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -319,6 +319,16 @@ void bch2_journal_halt(struct journal *j) spin_unlock(&j->lock); } +void bch2_journal_halt_locked(struct journal *j) +{ + lockdep_assert_held(&j->lock); + + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + journal_wake(j); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -381,9 +391,12 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; - if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, - c, "cannot start: journal seq overflow")) + if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + } BUG_ON(!j->cur_entry_sectors); @@ -783,6 +796,7 @@ recheck_need_open: } buf->must_flush = true; + j->flushing_seq = max(j->flushing_seq, seq); if (parent && !closure_wait(&buf->wait, parent)) BUG(); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index dccddd5420ad..107f7f901cd9 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -409,6 +409,7 @@ bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); +void bch2_journal_halt_locked(struct journal *); static inline int bch2_journal_error(struct journal *j) { diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 6a9cefb635d6..d373cd181a7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, + journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) - return JOURNAL_PIN_TYPE_btree; - else if (fn == bch2_btree_key_cache_journal_flush) + fn == bch2_btree_node_flush1) { + unsigned idx = fn == bch2_btree_node_flush1; + struct btree *b = container_of(pin, struct btree, writes[idx].journal); + + return JOURNAL_PIN_TYPE_btree0 - b->c.level; + } else if (fn == bch2_btree_key_cache_journal_flush) return JOURNAL_PIN_TYPE_key_cache; else return JOURNAL_PIN_TYPE_other; @@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j, bool reclaim = __journal_pin_drop(j, dst); - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_key_cache)| - BIT(JOURNAL_PIN_TYPE_other))) { - *did_work = true; - goto unlock; - } - - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_btree))) { - *did_work = true; - goto unlock; - } + for (int type = JOURNAL_PIN_TYPE_NR - 1; + type >= 0; + --type) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 3ba433a48eb8..1ef3a28ed6ab 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -53,7 +53,10 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_TYPE_btree, + JOURNAL_PIN_TYPE_btree3, + JOURNAL_PIN_TYPE_btree2, + JOURNAL_PIN_TYPE_btree1, + JOURNAL_PIN_TYPE_btree0, JOURNAL_PIN_TYPE_key_cache, JOURNAL_PIN_TYPE_other, JOURNAL_PIN_TYPE_NR, @@ -237,6 +240,7 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; + u64 flushing_seq; u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index a182b5d454ba..9d397fc2a1f0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -659,18 +659,4 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); -/* rebalance opts: */ - -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) -{ - return (struct bch_extent_rebalance) { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), -#define x(_name) \ - ._name = opts->_name, \ - ._name##_from_inode = opts->_name##_from_inode, - BCH_REBALANCE_OPTS() -#undef x - }; -}; - #endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 4adc74cd3f70..d0a1f5cd5c2b 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -121,12 +121,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { + if (opts->background_target) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; - } return sectors; } @@ -140,7 +138,7 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); return old == NULL || memcmp(old, &new, sizeof(new)); } else { return old != NULL; @@ -163,7 +161,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(opts); + *old = io_opts_to_rebalance_opts(c, opts); } else { if (old) extent_entry_drop(k, (union bch_extent_entry *) old); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 0a0821ab895d..62a3859d3823 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -4,8 +4,28 @@ #include "compress.h" #include "disk_groups.h" +#include "opts.h" #include "rebalance_types.h" +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, + struct bch_io_opts *opts) +{ + struct bch_extent_rebalance r = { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, + BCH_REBALANCE_OPTS() +#undef x + }; + + if (r.background_target && + !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) + r.background_target = 0; + + return r; +}; + u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); int bch2_get_update_rebalance_opts(struct btree_trans *, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 93ba4f4e47ca..376fd0a6e868 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -381,6 +381,8 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, not_found: if (flags & BTREE_TRIGGER_check_repair) { ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); + if (ret == -BCH_ERR_missing_indirect_extent) + ret = 0; if (ret) goto err; } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index ea0a18364751..b86ec013d7d7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -180,9 +180,9 @@ enum bch_fsck_flags { x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index e3d0475232e5..b7b96283c316 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -428,7 +428,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, POS(0, snapid), 0, snapshot); - ret = bkey_err(subvol); + ret = bkey_err(snapshot); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", snapid); if (ret) @@ -440,6 +440,11 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, BTREE_ID_snapshot_trees, POS(0, treeid), 0, snapshot_tree); + ret = bkey_err(snapshot_tree); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot tree %u", treeid); + if (ret) + goto err; if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { struct bkey_i_snapshot_tree *snapshot_tree_mut = diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d97ea7bd1171..6d97d412fed9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -411,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } +bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); + + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); + + wake_up(&bch2_read_only_wait); + return ret; +} + static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index fa6d52216510..04f8287eff5c 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -29,6 +29,7 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 56a5a7fbc0fd..c1b51009edf6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -727,7 +727,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail, TP_ARGS(c, str) ); -TRACE_EVENT(discard_buckets, +DECLARE_EVENT_CLASS(discard_buckets_class, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), TP_ARGS(c, seen, open, need_journal_commit, discarded, err), @@ -759,6 +759,18 @@ TRACE_EVENT(discard_buckets, __entry->err) ); +DEFINE_EVENT(discard_buckets_class, discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + +DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 92071ca0655f..3dc5a35dd19b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1496,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1539,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a2cac9d0a1a9..b2fae67f8fa3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -523,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) u64 end; u32 len; - /* For now only order 0 folios are supported for data. */ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -552,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -561,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -899,7 +898,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -915,14 +913,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -956,7 +952,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { @@ -1079,11 +1075,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + free_extent_map(em_cached); /* @@ -2380,12 +2383,20 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) free_extent_map(em_cached); submit_one_bio(&bio_ctrl); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36f51c311bb1..ed3c0d6546c5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1039,7 +1039,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1066,9 +1065,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 30eceaf829a7..4aca7475fd82 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1229,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b90fabe302e6..f9d3766c809b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1880,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su * Commit current transaction to make sure all the rfer/excl numbers * get updated. */ - trans = btrfs_start_transaction(fs_info->quota_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; @@ -1897,8 +1893,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. */ - if (ret == -EBUSY) + if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 15312013f2a3..aca83a98b75a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -274,8 +274,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); diff --git a/fs/dcache.c b/fs/dcache.c index 9cc0d47da321..e3634916ffb9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1700,7 +1700,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; - lockref_init(&dentry->d_lockref, 1); + lockref_init(&dentry->d_lockref); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; @@ -2966,11 +2966,11 @@ static int __d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: - if (alias->d_op->d_unalias_trylock && + if (alias->d_op && alias->d_op->d_unalias_trylock && !alias->d_op->d_unalias_trylock(alias)) goto out_err; __d_move(alias, dentry, false); - if (alias->d_op->d_unalias_unlock) + if (alias->d_op && alias->d_op->d_unalias_unlock) alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 29f8963bb523..d771e06db738 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -726,7 +726,7 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - lockref_init(&pcl->lockref, 1); /* one ref for this request */ + lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; diff --git a/fs/file_table.c b/fs/file_table.c index f0291a66f9db..5c00dc38558d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -194,6 +194,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); + /* + * Disable permission and pre-content events for all files by default. + * They may be enabled later by file_set_fsnotify_mode_from_watchers(). + */ + file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; } @@ -375,7 +380,13 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, if (IS_ERR(file)) { ihold(inode); path_put(&path); + return file; } + /* + * Disable all fsnotify events for pseudo files by default. + * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL(alloc_file_pseudo); @@ -400,6 +411,11 @@ struct file *alloc_file_pseudo_noaccount(struct inode *inode, return file; } file_init_path(file, &path, fops); + /* + * Disable all fsnotify events for pseudo files by default. + * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8c4c1f871a88..65c07aa95718 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1201,8 +1201,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (glops->go_instantiate) gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED); gl->gl_name = name; + lockref_init(&gl->gl_lockref); lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); - gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 04cadc02e5a6..0727f60ad028 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -51,7 +51,6 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - spin_lock_init(&gl->gl_lockref.lock); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 58bc5013ca49..2298e06797ac 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -236,7 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - lockref_init(&qd->qd_lockref, 0); + lockref_init(&qd->qd_lockref); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); @@ -297,7 +297,6 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, spin_lock_bucket(hash); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); if (qd == NULL) { - new_qd->qd_lockref.count++; *qdp = new_qd; list_add(&new_qd->qd_list, &sdp->sd_quota_list); hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); @@ -1450,6 +1449,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) if (qd == NULL) goto fail_brelse; + qd->qd_lockref.count = 0; set_bit(QDF_CHANGE, &qd->qd_flags); qd->qd_change = qc_change; qd->qd_slot = slot; diff --git a/fs/namespace.c b/fs/namespace.c index a3ed3f2980cb..8f1000f9f3df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5087,30 +5087,29 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; int err; - if (sb->s_op->show_options) { - size_t start = seq->count; - - err = security_sb_show_options(seq, sb); - if (err) - return err; + err = security_sb_show_options(seq, sb); + if (err) + return err; + if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; + } - if (unlikely(seq_has_overflowed(seq))) - return -EAGAIN; + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; - if (seq->count == start) - return 0; + if (seq->count == start) + return 0; - /* skip leading comma */ - memmove(seq->buf + start, seq->buf + start + 1, - seq->count - start - 1); - seq->count--; - } + /* skip leading comma */ + memmove(seq->buf + start, seq->buf + start + 1, + seq->count - start - 1); + seq->count--; return 0; } @@ -5191,39 +5190,45 @@ static int statmount_string(struct kstatmount *s, u64 flag) size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; - u32 start = seq->count; + u32 start, *offp; + + /* Reserve an empty string at the beginning for any unset offsets */ + if (!seq->count) + seq_putc(seq, 0); + + start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = start; + offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = start; + offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = start; + offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = start; + offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: - sm->opt_array = start; + offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: - sm->opt_sec_array = start; + offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: - sm->fs_subtype = start; + offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: - sm->sb_source = start; + offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; default: @@ -5251,6 +5256,7 @@ static int statmount_string(struct kstatmount *s, u64 flag) seq->buf[seq->count++] = '\0'; sm->mask |= flag; + *offp = start; return 0; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 0e552d873eaa..fb9b1656a287 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -446,11 +446,20 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) struct nfsd_file, nf_gc); struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); struct nfsd_fcache_disposal *l = nn->fcache_disposal; + struct svc_serv *serv; spin_lock(&l->lock); list_move_tail(&nf->nf_gc, &l->freeme); spin_unlock(&l->lock); - svc_wake_up(nn->nfsd_serv); + + /* + * The filecache laundrette is shut down after the + * nn->nfsd_serv pointer is cleared, but before the + * svc_serv is freed. + */ + serv = nn->nfsd_serv; + if (serv) + svc_wake_up(serv); } } diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 4e3be7201b1c..5fb202acb0fd 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -84,6 +84,8 @@ out: fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 5e34e98db969..7b5433bd3019 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -76,6 +76,8 @@ out: fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 50e468bdb8d4..484077200c5d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -679,7 +679,7 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, return status; status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); - if (unlikely(status || cb->cb_seq_status)) + if (unlikely(status || cb->cb_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) return -NFSERR_BAD_XDR; @@ -1583,8 +1583,11 @@ nfsd4_run_cb_work(struct work_struct *work) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; - if (!clnt) { - /* Callback channel broken, or client killed; give up: */ + if (!clnt || clp->cl_state == NFSD4_COURTESY) { + /* + * Callback channel broken, client killed or + * nfs4_client in courtesy state; give up. + */ nfsd41_destroy_cb(cb); return; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b7a0cfd05401..153eeea2c7c9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4459,10 +4459,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } } while (slot && --cnt > 0); } + +out: seq->maxslots = max(session->se_target_maxslots, seq->maxslots); seq->target_maxslots = session->se_target_maxslots; -out: switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 32019751a41e..aef474f1b84b 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -380,8 +380,9 @@ __fh_verify(struct svc_rqst *rqstp, error = check_nfsd_access(exp, rqstp, may_bypass_gss); if (error) goto out; - - svc_xprt_set_valid(rqstp->rq_xprt); + /* During LOCALIO call to fh_verify will be called with a NULL rqstp */ + if (rqstp) + svc_xprt_set_valid(rqstp->rq_xprt); /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8ee495a58d0a..fae1b6d397ea 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -648,7 +648,7 @@ EXPORT_SYMBOL_GPL(fsnotify); * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ -void file_set_fsnotify_mode(struct file *file) +void file_set_fsnotify_mode_from_watchers(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; @@ -665,7 +665,7 @@ void file_set_fsnotify_mode(struct file *file) */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { - file->f_mode |= FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return; } @@ -676,7 +676,7 @@ void file_set_fsnotify_mode(struct file *file) if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))) { - file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); return; } @@ -686,19 +686,25 @@ void file_set_fsnotify_mode(struct file *file) */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, - FSNOTIFY_PRE_CONTENT_EVENTS))) + FSNOTIFY_PRE_CONTENT_EVENTS))) { + /* Enable pre-content events */ + file_set_fsnotify_mode(file, 0); return; + } /* Is parent watching for pre-content events on this file? */ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask = fsnotify_inode_watches_children(d_inode(parent)); dput(parent); - if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) + if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) { + /* Enable pre-content events */ + file_set_fsnotify_mode(file, 0); return; + } } /* Nobody watching for pre-content events from this file */ - file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); } #endif diff --git a/fs/open.c b/fs/open.c index 932e5a6de63b..1be20de9f283 100644 --- a/fs/open.c +++ b/fs/open.c @@ -905,7 +905,8 @@ static int do_dentry_open(struct file *f, f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { - f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY; + f->f_mode = FMODE_PATH | FMODE_OPENED; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } @@ -935,10 +936,10 @@ static int do_dentry_open(struct file *f, /* * Set FMODE_NONOTIFY_* bits according to existing permission watches. - * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't - * change anything. + * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a + * pseudo file, this call will not change the mode. */ - file_set_fsnotify_mode(f); + file_set_fsnotify_mode_from_watchers(f); error = fsnotify_open_perm(f); if (error) goto cleanup_all; @@ -1122,7 +1123,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags, if (!IS_ERR(f)) { int error; - f->f_mode |= FMODE_NONOTIFY; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); error = vfs_open(path, f); if (error) { fput(f); diff --git a/fs/pidfs.c b/fs/pidfs.c index 049352f973de..63f9699ebac3 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -287,7 +287,6 @@ static bool pidfs_ioctl_valid(unsigned int cmd) switch (cmd) { case FS_IOC_GETVERSION: case PIDFD_GET_CGROUP_NAMESPACE: - case PIDFD_GET_INFO: case PIDFD_GET_IPC_NAMESPACE: case PIDFD_GET_MNT_NAMESPACE: case PIDFD_GET_NET_NAMESPACE: @@ -300,6 +299,17 @@ static bool pidfs_ioctl_valid(unsigned int cmd) return true; } + /* Extensible ioctls require some more careful checks. */ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PIDFD_GET_INFO): + /* + * Try to prevent performing a pidfd ioctl when someone + * erronously mistook the file descriptor for a pidfd. + * This is not perfect but will catch most cases. + */ + return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + } + return false; } diff --git a/fs/pipe.c b/fs/pipe.c index 94b59045ab44..ce1af7592780 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -960,6 +960,12 @@ int create_pipe_files(struct file **res, int flags) res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); + /* + * Disable permission and pre-content events, but enable legacy + * inotify events for legacy users. + */ + file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM); + file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM); return 0; } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index a68434ad744a..ac1f890a0d54 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -357,7 +357,7 @@ struct smb_version_operations { int (*handle_cancelled_mid)(struct mid_q_entry *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache); + __u16 epoch, bool *purge_cache); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -552,12 +552,12 @@ struct smb_version_operations { /* if we can do cache read operations */ bool (*is_read_op)(__u32); /* set oplock level for the inode */ - void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, - bool *); + void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch, + bool *purge_cache); /* create lease context buffer for CREATE request */ char * (*create_lease_buf)(u8 *lease_key, u8 oplock); /* parse lease context buffer and return oplock/epoch info */ - __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey); + __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, @@ -1447,7 +1447,7 @@ struct cifs_fid { __u8 create_guid[16]; __u32 access; struct cifs_pending_open *pending_open; - unsigned int epoch; + __u16 epoch; #ifdef CONFIG_CIFS_DEBUG2 __u64 mid; #endif /* CIFS_DEBUG2 */ @@ -1480,7 +1480,7 @@ struct cifsFileInfo { bool oplock_break_cancelled:1; bool status_file_deleted:1; /* file has been deleted */ bool offload:1; /* offload final part of _put to a wq */ - unsigned int oplock_epoch; /* epoch from the lease break */ + __u16 oplock_epoch; /* epoch from the lease break */ __u32 oplock_level; /* oplock/lease level from the lease break */ int count; spinlock_t file_info_lock; /* protects four flag/count fields above */ @@ -1577,7 +1577,7 @@ struct cifsInodeInfo { spinlock_t open_file_lock; /* protects openFileList */ __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ unsigned int oplock; /* oplock/lease level we have */ - unsigned int epoch; /* used to track lease state changes */ + __u16 epoch; /* used to track lease state changes */ #define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ #define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ #define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */ diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index dad521336b5e..f65a8a90ba27 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -150,25 +150,27 @@ again: if (rc) continue; - if (tgt.flags & DFSREF_STORAGE_SERVER) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (!rc) - rc = cifs_is_path_remote(mnt_ctx); + rc = cifs_mount_get_tcon(mnt_ctx); + if (rc) { + if (tgt.server_type == DFS_TYPE_LINK && + DFS_INTERLINK(tgt.flags)) + rc = -EREMOTE; + } else { + rc = cifs_is_path_remote(mnt_ctx); if (!rc) { ref_walk_set_tgt_hint(rw); break; } - if (rc != -EREMOTE) - continue; } - - rc = ref_walk_advance(rw); - if (!rc) { - rc = setup_dfs_ref(&tgt, rw); - if (rc) - break; - ref_walk_mark_end(rw); - goto again; + if (rc == -EREMOTE) { + rc = ref_walk_advance(rw); + if (!rc) { + rc = setup_dfs_ref(&tgt, rw); + if (rc) + break; + ref_walk_mark_end(rw); + goto again; + } } } } while (rc && ref_walk_descend(rw)); diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index ed4cd7cf1ec6..e60f0a24a8a1 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -188,4 +188,11 @@ static inline void dfs_put_root_smb_sessions(struct list_head *head) } } +static inline const char *dfs_ses_refpath(struct cifs_ses *ses) +{ + const char *path = ses->server->leaf_fullpath; + + return path ? path + 1 : ERR_PTR(-ENOENT); +} + #endif /* _CIFS_DFS_H */ diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 5022bb1f122a..4dada26d56b5 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1136,33 +1136,19 @@ static bool is_ses_good(struct cifs_ses *ses) return ret; } -static char *get_ses_refpath(struct cifs_ses *ses) -{ - struct TCP_Server_Info *server = ses->server; - char *path = ERR_PTR(-ENOENT); - - if (server->leaf_fullpath) { - path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL); - if (!path) - path = ERR_PTR(-ENOMEM); - } - return path; -} - /* Refresh dfs referral of @ses */ static void refresh_ses_referral(struct cifs_ses *ses) { struct cache_entry *ce; unsigned int xid; - char *path; + const char *path; int rc = 0; xid = get_xid(); - path = get_ses_refpath(ses); + path = dfs_ses_refpath(ses); if (IS_ERR(path)) { rc = PTR_ERR(path); - path = NULL; goto out; } @@ -1181,7 +1167,6 @@ static void refresh_ses_referral(struct cifs_ses *ses) out: free_xid(xid); - kfree(path); } static int __refresh_tcon_referral(struct cifs_tcon *tcon, @@ -1231,19 +1216,18 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) struct dfs_info3_param *refs = NULL; struct cache_entry *ce; struct cifs_ses *ses; - unsigned int xid; bool needs_refresh; - char *path; + const char *path; + unsigned int xid; int numrefs = 0; int rc = 0; xid = get_xid(); ses = tcon->ses; - path = get_ses_refpath(ses); + path = dfs_ses_refpath(ses); if (IS_ERR(path)) { rc = PTR_ERR(path); - path = NULL; goto out; } @@ -1271,7 +1255,6 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) out: free_xid(xid); - kfree(path); free_dfs_info_array(refs, numrefs); } diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 9756b876a75e..d6e2fb669c40 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -377,7 +377,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) static void cifs_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { cifs_set_oplock_level(cinode, oplock); } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 77309217dab4..ec36bed54b0b 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3904,22 +3904,22 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, static void smb2_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { server->ops->set_oplock_level(cinode, oplock, 0, NULL); } static void smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache); + __u16 epoch, bool *purge_cache); static void smb3_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { unsigned int old_state = cinode->oplock; - unsigned int old_epoch = cinode->epoch; + __u16 old_epoch = cinode->epoch; unsigned int new_state; if (epoch > old_epoch) { @@ -3939,7 +3939,7 @@ smb3_downgrade_oplock(struct TCP_Server_Info *server, static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { oplock &= 0xFF; cinode->lease_granted = false; @@ -3963,7 +3963,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, static void smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { char message[5] = {0}; unsigned int new_oplock = 0; @@ -4000,7 +4000,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, static void smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { unsigned int old_oplock = cinode->oplock; @@ -4114,7 +4114,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) } static __u8 -smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) +smb2_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key) { struct create_lease *lc = (struct create_lease *)buf; @@ -4125,7 +4125,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) } static __u8 -smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) +smb3_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key) { struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 40ad9e79437a..ed7812247ebc 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2169,7 +2169,7 @@ tcon_exit: tcon_error_exit: if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) - cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); + cifs_dbg(VFS | ONCE, "BAD_NETWORK_NAME: %s\n", tree); goto tcon_exit; } @@ -2329,7 +2329,7 @@ parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info, int smb2_parse_contexts(struct TCP_Server_Info *server, struct kvec *rsp_iov, - unsigned int *epoch, + __u16 *epoch, char *lease_key, __u8 *oplock, struct smb2_file_all_info *buf, struct create_posix_rsp *posix) diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 2336dfb23f36..4662c7e2d259 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -283,7 +283,7 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); int smb2_parse_contexts(struct TCP_Server_Info *server, struct kvec *rsp_iov, - unsigned int *epoch, + __u16 *epoch, char *lease_key, __u8 *oplock, struct smb2_file_all_info *buf, struct create_posix_rsp *posix); diff --git a/fs/stat.c b/fs/stat.c index 2c0e111a098a..f13308bfdc98 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -281,6 +281,8 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, u32 request_mask) { int error = vfs_getattr(path, stat, request_mask, flags); + if (error) + return error; if (request_mask & STATX_MNT_ID_UNIQUE) { stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; @@ -302,7 +304,7 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, if (S_ISBLK(stat->mode)) bdev_statx(path, stat, request_mask); - return error; + return 0; } static int vfs_statx_fd(int fd, int flags, struct kstat *stat, diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index e95b8a48d8a0..1d94bb784108 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -21,7 +21,8 @@ #define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ -static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375"; +static const unsigned char VBSF_MOUNT_SIGNATURE[4] = { '\000', '\377', '\376', + '\375' }; static int follow_symlinks; module_param(follow_symlinks, int, 0444); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 40ad22fb808b..0ef19f1469ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3563,12 +3563,12 @@ xfs_bmap_btalloc_at_eof( int error; /* - * If there are already extents in the file, try an exact EOF block - * allocation to extend the file as a contiguous extent. If that fails, - * or it's the first allocation in a file, just try for a stripe aligned - * allocation. + * If there are already extents in the file, and xfs_bmap_adjacent() has + * given a better blkno, try an exact EOF block allocation to extend the + * file as a contiguous extent. If that fails, or it's the first + * allocation in a file, just try for a stripe aligned allocation. */ - if (ap->offset) { + if (ap->eof) { xfs_extlen_t nextminlen = 0; /* @@ -3736,7 +3736,8 @@ xfs_bmap_btalloc_best_length( int error; ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); - xfs_bmap_adjacent(ap); + if (!xfs_bmap_adjacent(ap)) + ap->eof = false; /* * Search for an allocation group with a single extent large enough for diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d1d4a0a22e13..15bb790359f8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -41,8 +41,7 @@ struct kmem_cache *xfs_buf_cache; * * xfs_buf_rele: * b_lock - * pag_buf_lock - * lru_lock + * lru_lock * * xfs_buftarg_drain_rele * lru_lock @@ -220,23 +219,25 @@ _xfs_buf_alloc( */ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - spin_lock_init(&bp->b_lock); + /* + * A new buffer is held and locked by the owner. This ensures that the + * buffer is owned by the caller and racing RCU lookups right after + * inserting into the hash table are safe (and will have to wait for + * the unlock to do anything non-trivial). + */ bp->b_hold = 1; + sema_init(&bp->b_sema, 0); /* held, no waiters */ + + spin_lock_init(&bp->b_lock); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_li_list); - sema_init(&bp->b_sema, 0); /* held, no waiters */ bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; - /* - * Set length and io_length to the same value initially. - * I/O routines should use io_length, which will be the same in - * most cases but may be reset (e.g. XFS recovery). - */ error = xfs_buf_get_maps(bp, nmaps); if (error) { kmem_cache_free(xfs_buf_cache, bp); @@ -502,7 +503,6 @@ int xfs_buf_cache_init( struct xfs_buf_cache *bch) { - spin_lock_init(&bch->bc_lock); return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } @@ -652,17 +652,20 @@ xfs_buf_find_insert( if (error) goto out_free_buf; - spin_lock(&bch->bc_lock); + /* The new buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; + + rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { + rcu_read_unlock(); error = PTR_ERR(bp); - spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp && xfs_buf_try_hold(bp)) { /* found an existing buffer */ - spin_unlock(&bch->bc_lock); + rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -670,10 +673,8 @@ xfs_buf_find_insert( *bpp = bp; goto out_free_buf; } + rcu_read_unlock(); - /* The new buffer keeps the perag reference until it is freed. */ - new_bp->b_pag = pag; - spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; @@ -1090,7 +1091,6 @@ xfs_buf_rele_cached( } /* we are asked to drop the last reference */ - spin_lock(&bch->bc_lock); __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* @@ -1102,7 +1102,6 @@ xfs_buf_rele_cached( bp->b_state &= ~XFS_BSTATE_DISPOSE; else bp->b_hold--; - spin_unlock(&bch->bc_lock); } else { bp->b_hold--; /* @@ -1120,7 +1119,6 @@ xfs_buf_rele_cached( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, xfs_buf_hash_params); - spin_unlock(&bch->bc_lock); if (pag) xfs_perag_put(pag); freebuf = true; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7e73663c5d4a..3b4ed42e11c0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -80,7 +80,6 @@ typedef unsigned int xfs_buf_flags_t; #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { - spinlock_t bc_lock; struct rhashtable bc_hash; }; diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index f340a2015c4c..0b41bdfecdfb 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -329,22 +329,6 @@ out_trans_cancel: * successfully but before locks are dropped. */ -/* Verify that we have security clearance to perform this operation. */ -static int -xfs_exchange_range_verify_area( - struct xfs_exchrange *fxr) -{ - int ret; - - ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, - true); - if (ret) - return ret; - - return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, - true); -} - /* * Performs necessary checks before doing a range exchange, having stabilized * mutable inode attributes via i_rwsem. @@ -355,11 +339,13 @@ xfs_exchange_range_checks( unsigned int alloc_unit) { struct inode *inode1 = file_inode(fxr->file1); + loff_t size1 = i_size_read(inode1); struct inode *inode2 = file_inode(fxr->file2); + loff_t size2 = i_size_read(inode2); uint64_t allocmask = alloc_unit - 1; int64_t test_len; uint64_t blen; - loff_t size1, size2, tmp; + loff_t tmp; int error; /* Don't touch certain kinds of inodes */ @@ -368,24 +354,25 @@ xfs_exchange_range_checks( if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) return -ETXTBSY; - size1 = i_size_read(inode1); - size2 = i_size_read(inode2); - /* Ranges cannot start after EOF. */ if (fxr->file1_offset > size1 || fxr->file2_offset > size2) return -EINVAL; - /* - * If the caller said to exchange to EOF, we set the length of the - * request large enough to cover everything to the end of both files. - */ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { + /* + * If the caller said to exchange to EOF, we set the length of + * the request large enough to cover everything to the end of + * both files. + */ fxr->length = max_t(int64_t, size1 - fxr->file1_offset, size2 - fxr->file2_offset); - - error = xfs_exchange_range_verify_area(fxr); - if (error) - return error; + } else { + /* + * Otherwise we require both ranges to end within EOF. + */ + if (fxr->file1_offset + fxr->length > size1 || + fxr->file2_offset + fxr->length > size2) + return -EINVAL; } /* @@ -402,15 +389,6 @@ xfs_exchange_range_checks( return -EINVAL; /* - * We require both ranges to end within EOF, unless we're exchanging - * to EOF. - */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && - (fxr->file1_offset + fxr->length > size1 || - fxr->file2_offset + fxr->length > size2)) - return -EINVAL; - - /* * Make sure we don't hit any file size limits. If we hit any size * limits such that test_length was adjusted, we abort the whole * operation. @@ -747,6 +725,7 @@ xfs_exchange_range( { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); + loff_t check_len = fxr->length; int ret; BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & @@ -779,14 +758,18 @@ xfs_exchange_range( return -EBADF; /* - * If we're not exchanging to EOF, we can check the areas before - * stabilizing both files' i_size. + * If we're exchanging to EOF we can't calculate the length until taking + * the iolock. Pass a 0 length to remap_verify_area similar to the + * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well. */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { - ret = xfs_exchange_range_verify_area(fxr); - if (ret) - return ret; - } + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) + check_len = 0; + ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true); + if (ret) + return ret; + ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true); + if (ret) + return ret; /* Update cmtime if the fd/inode don't forbid it. */ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c95fe1b1de4e..b1f9f156ec88 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1404,8 +1404,11 @@ xfs_inactive( goto out; /* Try to clean out the cow blocks if there are any. */ - if (xfs_inode_has_cow_data(ip)) - xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (xfs_inode_has_cow_data(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (error) + goto out; + } if (VFS_I(ip)->i_nlink != 0) { /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c..d61460309a78 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -976,10 +976,8 @@ xfs_dax_write_iomap_end( if (!xfs_is_cow_inode(ip)) return 0; - if (!written) { - xfs_reflink_cancel_cow_range(ip, pos, length, true); - return 0; - } + if (!written) + return xfs_reflink_cancel_cow_range(ip, pos, length, true); return xfs_reflink_end_cow(ip, pos, written); } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 54504013c749..02a4adb4a999 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1038,6 +1038,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.discard) \ *(.discard.*) \ *(.export_symbol) \ + *(.no_trim_symbol) \ *(.modinfo) \ /* ld.bfd warns about .gnu.version* even when not emitted */ \ *(.gnu.version*) \ diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h index f77fe1531cf8..9732f514566d 100644 --- a/include/drm/drm_print.h +++ b/include/drm/drm_print.h @@ -32,6 +32,7 @@ #include <linux/dynamic_debug.h> #include <drm/drm.h> +#include <drm/drm_device.h> struct debugfs_regset32; struct drm_device; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b087de2f3e94..200fd3c5bc70 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -191,6 +191,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __v; \ }) +#ifdef __CHECKER__ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) +#else /* __CHECKER__ */ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) +#endif /* __CHECKER__ */ + +/* &a[0] degrades to a pointer: a different type from an array */ +#define __is_array(a) (!__same_type((a), &(a)[0])) +#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_array(a), \ + "must be array") + +#define __is_byte_array(a) (__is_array(a) && sizeof((a)[0]) == 1) +#define __must_be_byte_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_byte_array(a), \ + "must be byte array") + +/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ +#define __must_be_cstr(p) \ + __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") + #endif /* __KERNEL__ */ /** @@ -231,19 +250,6 @@ static inline void *offset_to_ptr(const int *off) #define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) -#ifdef __CHECKER__ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) -#else /* __CHECKER__ */ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) -#endif /* __CHECKER__ */ - -/* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") - -/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ -#define __must_be_cstr(p) \ - __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") - /* * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. diff --git a/include/linux/fs.h b/include/linux/fs.h index be3ad155ec9f..2c3b2f8a621f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -222,7 +222,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_FSNOTIFY_HSM(mode) 0 #endif - /* * Attribute flags. These should be or-ed together to figure out what * has been changed! @@ -791,6 +790,19 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { + int testlen; + + /* + * TODO: patch it into a debug-only check if relevant macros show up. + * In the meantime, since we are suffering strlen even on production kernels + * to find the right length, do a fixup if the wrong value got passed. + */ + testlen = strlen(link); + if (testlen != linklen) { + WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", + link, linklen, testlen); + linklen = testlen; + } inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; @@ -3140,6 +3152,12 @@ static inline void exe_file_allow_write_access(struct file *exe_file) allow_write_access(exe_file); } +static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) +{ + file->f_mode &= ~FMODE_FSNOTIFY_MASK; + file->f_mode |= mode; +} + static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1a9ef8f6784d..6a33288bd6a1 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -129,7 +129,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -void file_set_fsnotify_mode(struct file *file); +void file_set_fsnotify_mode_from_watchers(struct file *file); /* * fsnotify_file_area_perm - permission hook before access to file range @@ -213,7 +213,7 @@ static inline int fsnotify_open_perm(struct file *file) } #else -static inline void file_set_fsnotify_mode(struct file *file) +static inline void file_set_fsnotify_mode_from_watchers(struct file *file) { } diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index c3b4b7ed7c16..84a5045f80f3 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -125,6 +125,7 @@ struct hrtimer_cpu_base { ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; + call_single_data_t csd; } ____cacheline_aligned; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index c31fd1dba3bd..2b2af24d2a43 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -244,6 +244,7 @@ enum i2c_driver_flags { * @id_table: List of I2C devices supported by this driver * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) + * @clients: List of detected clients we created (for i2c-core use only) * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. @@ -298,6 +299,7 @@ struct i2c_driver { /* Device detection callback for automatic device creation */ int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; + struct list_head clients; u32 flags; }; @@ -313,6 +315,8 @@ struct i2c_driver { * @dev: Driver model device node for the slave. * @init_irq: IRQ that was set at initialization * @irq: indicates the IRQ generated by this device (if any) + * @detected: member of an i2c_driver.clients list or i2c-core's + * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources @@ -332,8 +336,6 @@ struct i2c_client { #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ -#define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ -#define I2C_CLIENT_USER 0x200 /* client was userspace-created */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ @@ -345,6 +347,7 @@ struct i2c_client { struct device dev; /* the device structure */ int init_irq; /* irq set at initialization */ int irq; /* irq issued by device */ + struct list_head detected; #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif @@ -751,6 +754,9 @@ struct i2c_adapter { char name[48]; struct completion dev_released; + struct mutex userspace_clients_lock; + struct list_head userspace_clients; + struct i2c_bus_recovery_info *bus_recovery_info; const struct i2c_adapter_quirks *quirks; diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index ed945f42e064..0ea8c9887429 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -537,7 +537,7 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) * * Return: jiffies value */ -#define secs_to_jiffies(_secs) ((_secs) * HZ) +#define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3cb9a32a6330..f34f4cfaa513 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1615,7 +1615,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); -int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); void kvm_arch_create_vm_debugfs(struct kvm *kvm); diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c39f119659ba..676721ee878d 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -37,12 +37,13 @@ struct lockref { /** * lockref_init - Initialize a lockref * @lockref: pointer to lockref structure - * @count: initial count + * + * Initializes @lockref->count to 1. */ -static inline void lockref_init(struct lockref *lockref, unsigned int count) +static inline void lockref_init(struct lockref *lockref) { spin_lock_init(&lockref->lock); - lockref->count = count; + lockref->count = 1; } void lockref_get(struct lockref *lockref); diff --git a/include/linux/module.h b/include/linux/module.h index 23792d5d7b74..30e5b19bafa9 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -306,7 +306,10 @@ extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); -#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x)))) +#define symbol_get(x) ({ \ + static const char __notrim[] \ + __used __section(".no_trim_symbol") = __stringify(x); \ + (typeof(&x))(__symbol_get(__stringify(x))); }) /* modules using other modules: kdb wants to see this. */ struct module_use { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a59034a5fa2..c0a86afb85da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2664,6 +2664,12 @@ struct net *dev_net(const struct net_device *dev) } static inline +struct net *dev_net_rcu(const struct net_device *dev) +{ + return read_pnet_rcu(&dev->nd_net); +} + +static inline void dev_net_set(struct net_device *dev, struct net *net) { write_pnet(&dev->nd_net, net); @@ -2904,9 +2910,9 @@ struct pcpu_sw_netstats { struct pcpu_dstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; - u64_stats_t rx_drops; u64_stats_t tx_packets; u64_stats_t tx_bytes; + u64_stats_t rx_drops; u64_stats_t tx_drops; struct u64_stats_sync syncp; } __aligned(8 * sizeof(u64)); diff --git a/include/linux/string.h b/include/linux/string.h index 86d5d352068b..f8e21e80942f 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -414,7 +414,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * must be discoverable by the compiler. */ #define strtomem_pad(dest, src, pad) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ @@ -437,7 +438,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * must be discoverable by the compiler. */ #define strtomem(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ @@ -456,7 +458,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ @@ -481,7 +484,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr_pad(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index d9c767cf773d..9189354c568f 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -668,7 +668,7 @@ struct l2cap_conn { struct l2cap_chan *smp; struct list_head chan_l; - struct mutex chan_lock; + struct mutex lock; struct kref ref; struct list_head users; }; @@ -970,6 +970,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); diff --git a/include/net/ip.h b/include/net/ip.h index 9f5e33e371fc..ba7b43447775 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -471,9 +471,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); - struct net *net = dev_net(dst->dev); - unsigned int mtu; + unsigned int mtu, res; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -497,7 +500,11 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); + + rcu_read_unlock(); + + return res; } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 2d6141f28b53..f7fe796e8429 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -198,10 +198,12 @@ struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) if (netif_is_l3_slave(dev)) { struct net_device *master; + rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); + rcu_read_unlock(); } return skb; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 0f5eb9db0c62..7ba1402ca779 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -398,7 +398,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif } -static inline struct net *read_pnet_rcu(possible_net_t *pnet) +static inline struct net *read_pnet_rcu(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference(pnet->net); diff --git a/include/net/route.h b/include/net/route.h index f86775be3e29..c605fd5ec0c0 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -382,10 +382,15 @@ static inline int inet_iif(const struct sk_buff *skb) static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - struct net *net = dev_net(dst->dev); - if (hoplimit == 0) + if (hoplimit == 0) { + const struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); + rcu_read_unlock(); + } return hoplimit; } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d635c5b47eba..d48c657191cd 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -851,7 +851,7 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, - __u64 bytes, __u32 packets) + __u64 bytes, __u64 packets) { u64_stats_update_begin(&bstats->syncp); u64_stats_add(&bstats->bytes, bytes); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2f119d18a061..cad50d91077e 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -219,6 +219,7 @@ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ + EM(rxrpc_conn_get_poke_secured, "GET secured ") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ EM(rxrpc_conn_new_client, "NEW client ") \ diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index efe5de6ce208..aaa4f3bc688b 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -411,13 +411,20 @@ struct drm_amdgpu_gem_userptr { /* GFX12 and later: */ #define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 0 #define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 -/* These are DCC recompression setting for memory management: */ +/* These are DCC recompression settings for memory management: */ #define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 #define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ #define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 #define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ +/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata + * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */ +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK 0x1 +/* bit gap */ +#define AMDGPU_TILING_GFX12_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_GFX12_SCANOUT_MASK 0x1 /* Set/Get helpers for tiling flags. */ #define AMDGPU_TILING_SET(field, value) \ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1089b88efc7..9b18c4cfe56f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -682,6 +682,7 @@ enum ethtool_link_ext_substate_module { * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics * @ETH_SS_STATS_RMON: names of RMON statistics * @ETH_SS_STATS_PHY: names of PHY(dev) statistics + * @ETH_SS_TS_FLAGS: hardware timestamping flags * * @ETH_SS_COUNT: number of defined string sets */ @@ -708,6 +709,7 @@ enum ethtool_stringset { ETH_SS_STATS_ETH_CTRL, ETH_SS_STATS_RMON, ETH_SS_STATS_PHY, + ETH_SS_TS_FLAGS, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h index 89672ad8c3bb..f151feb0ca8c 100644 --- a/include/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -385,8 +385,8 @@ enum { /* Possible values for dExtendedUFSFeaturesSupport */ enum { - UFS_DEV_LOW_TEMP_NOTIF = BIT(4), - UFS_DEV_HIGH_TEMP_NOTIF = BIT(5), + UFS_DEV_HIGH_TEMP_NOTIF = BIT(4), + UFS_DEV_LOW_TEMP_NOTIF = BIT(5), UFS_DEV_EXT_TEMP_NOTIF = BIT(6), UFS_DEV_HPB_SUPPORT = BIT(7), UFS_DEV_WRITE_BOOSTER_SUP = BIT(8), diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 650ff238cd74..8bf31e6ca4e5 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1309,7 +1309,6 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg) void ufshcd_enable_irq(struct ufs_hba *hba); void ufshcd_disable_irq(struct ufs_hba *hba); int ufshcd_alloc_host(struct device *, struct ufs_hba **); -void ufshcd_dealloc_host(struct ufs_hba *); int ufshcd_hba_enable(struct ufs_hba *hba); int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int); int ufshcd_link_recovery(struct ufs_hba *hba); diff --git a/io_uring/futex.c b/io_uring/futex.c index 3159a2b7eeca..43e2143255f5 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -338,7 +338,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); - futex_queue(&ifd->q, hb); + futex_queue(&ifd->q, hb, NULL); return IOU_ISSUE_SKIP_COMPLETE; } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index ebdd76b4ecbb..3db8567f5a44 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -532,7 +532,8 @@ void futex_q_unlock(struct futex_hash_bucket *hb) futex_hb_waiters_dec(hb); } -void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) { int prio; @@ -548,7 +549,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); - q->task = current; + q->task = task; } /** diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 99b32e728c4a..6b2f4c7eb720 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -285,13 +285,15 @@ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) } extern void __futex_unqueue(struct futex_q *q); -extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); +extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket + * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The @@ -299,11 +301,14 @@ extern int futex_unqueue(struct futex_q *q); * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). + * + * Note that @task may be NULL, for async usage of futexes. */ -static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) __releases(&hb->lock) { - __futex_queue(q, hb); + __futex_queue(q, hb, task); spin_unlock(&hb->lock); } diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index daea650b16f5..7a941845f7ee 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -982,7 +982,7 @@ retry_private: /* * Only actually queue now that the atomic ops are done: */ - __futex_queue(&q, hb); + __futex_queue(&q, hb, current); if (trylock) { ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index eb86a7ade06a..25877d4f2f8f 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -349,7 +349,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - futex_queue(q, hb); + futex_queue(q, hb, current); /* Arm the timer */ if (timeout) @@ -460,7 +460,7 @@ retry: * next futex. Queue each futex at this moment so hb can * be unlocked. */ - futex_queue(q, hb); + futex_queue(q, hb, current); continue; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 4005b13ebd7f..5dc5b0d7238e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -859,7 +859,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; - int ret; + int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); @@ -892,7 +892,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) out: free_cpumask_var(affinity); - return 0; + return ret; } /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fd7e85220715..ef047add7f9e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1262,6 +1262,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); + } else if (fair_policy(p->policy)) { + P(se.slice); } #ifdef CONFIG_SCHED_CLASS_EXT __PS("ext.enabled", task_on_scx(p)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ce2e94ccad0c..1c0ef435a7aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5385,6 +5385,15 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void set_delayed(struct sched_entity *se) { se->sched_delayed = 1; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since dequeue_entities() + * will account it for blocked tasks. + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -5397,6 +5406,16 @@ static void set_delayed(struct sched_entity *se) static void clear_delayed(struct sched_entity *se) { se->sched_delayed = 0; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since a dequeue has + * already accounted for it or an enqueue of a task + * below it will account for it in enqueue_task_fair(). + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f59381c4a2ff..7bbb408431eb 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -749,6 +749,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, if (WARN_ON_ONCE(!fprog)) return false; + /* Our single exception to filtering. */ +#ifdef __NR_uretprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + if (sd->nr == __NR_uretprobe) + return true; +#endif + for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; u16 code = insn->code; @@ -1023,6 +1032,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, +#ifdef __NR_uretprobe + __NR_uretprobe, +#endif -1, /* negative terminated */ }; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7304d7cf47f2..2a7802ec480c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -373,16 +373,18 @@ void clocksource_verify_percpu(struct clocksource *cs) cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); cpus_read_lock(); - preempt_disable(); + migrate_disable(); clocksource_verify_choose_cpus(); if (cpumask_empty(&cpus_chosen)) { - preempt_enable(); + migrate_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } testcpu = smp_processor_id(); - pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; @@ -402,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); + migrate_enable(); cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f6d8df94045c..deb1aa32814e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,8 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); + /* * The timer bases: * @@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { @@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_TAI] = HRTIMER_BASE_TAI, }; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -145,11 +156,6 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return base == &migration_base; -} - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -183,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will we issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -254,8 +287,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -264,8 +297,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -275,11 +307,6 @@ again: #else /* CONFIG_SMP */ -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return false; -} - static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) @@ -716,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -1205,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1216,10 +1242,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. @@ -1248,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid @@ -1370,6 +1421,18 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, } } +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 9cb9b6584ea1..2f6330831f08 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1675,6 +1675,9 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) } while (i < tmigr_hierarchy_levels); + /* Assert single root */ + WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + while (i > 0) { group = stack[--i]; @@ -1716,7 +1719,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) WARN_ON_ONCE(top == 0); lvllist = &tmigr_level_list[top]; - if (group->num_children == 1 && list_is_singular(lvllist)) { + + /* + * Newly created root level should have accounted the upcoming + * CPU's child group and pre-accounted the old root. + */ + if (group->num_children == 2 && list_is_singular(lvllist)) { /* * The target CPU must never do the prepare work, except * on early boot when the boot CPU is the target. Otherwise diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 54d850997c0a..136c750b0b4d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -198,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, * returning from the function. */ if (ftrace_graph_notrace_addr(trace->func)) { - *task_var |= TRACE_GRAPH_NOTRACE_BIT; + *task_var |= TRACE_GRAPH_NOTRACE; /* * Need to return 1 to have the return called * that will clear the NOTRACE bit. diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c index fbe910c9c825..135322592faf 100644 --- a/lib/stackinit_kunit.c +++ b/lib/stackinit_kunit.c @@ -75,8 +75,10 @@ static bool stackinit_range_contains(char *haystack_start, size_t haystack_size, */ #ifdef CONFIG_M68K #define FILL_SIZE_STRING 8 +#define FILL_SIZE_ARRAY 2 #else #define FILL_SIZE_STRING 16 +#define FILL_SIZE_ARRAY 8 #endif #define INIT_CLONE_SCALAR /**/ @@ -345,11 +347,11 @@ union test_small_start { short three; unsigned long four; struct big_struct { - unsigned long array[8]; + unsigned long array[FILL_SIZE_ARRAY]; } big; }; -/* Mismatched sizes, with one and two being small */ +/* Mismatched sizes, with three and four being small */ union test_small_end { short one; unsigned long two; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index aa6c714892ec..9f3b8b682adb 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -685,6 +685,15 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } + if (ax25->ax25_dev) { + if (dev == ax25->ax25_dev->dev) { + rcu_read_unlock(); + break; + } + netdev_put(ax25->ax25_dev->dev, &ax25->dev_tracker); + ax25_dev_put(ax25->ax25_dev); + } + ax25->ax25_dev = ax25_dev_ax25dev(dev); if (!ax25->ax25_dev) { rcu_read_unlock(); @@ -692,6 +701,8 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } ax25_fillin_cb(ax25, ax25->ax25_dev); + netdev_hold(dev, &ax25->dev_tracker, GFP_ATOMIC); + ax25_dev_hold(ax25->ax25_dev); rcu_read_unlock(); break; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ac11f1f08db0..d35479c465e2 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -113,8 +113,6 @@ static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); - INIT_WORK(&hardif_neigh->bat_v.metric_work, - batadv_v_elp_throughput_metric_update); } /** diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574e6bf..b065578b4436 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -18,6 +18,7 @@ #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> +#include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/nl80211.h> @@ -26,6 +27,7 @@ #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> @@ -42,6 +44,18 @@ #include "send.h" /** + * struct batadv_v_metric_queue_entry - list of hardif neighbors which require + * and metric update + */ +struct batadv_v_metric_queue_entry { + /** @hardif_neigh: hardif neighbor scheduled for metric update */ + struct batadv_hardif_neigh_node *hardif_neigh; + + /** @list: list node for metric_queue */ + struct list_head list; +}; + +/** * batadv_v_elp_start_timer() - restart timer for ELP periodic work * @hard_iface: the interface for which the timer has to be reset */ @@ -59,25 +73,36 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) /** * batadv_v_elp_get_throughput() - get the throughput towards a neighbour * @neigh: the neighbour for which the throughput has to be obtained + * @pthroughput: calculated throughput towards the given neighbour in multiples + * of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). * - * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). + * Return: true when value behind @pthroughput was set */ -static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) +static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, + u32 *pthroughput) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct net_device *soft_iface = hard_iface->soft_iface; struct ethtool_link_ksettings link_settings; struct net_device *real_netdev; struct station_info sinfo; u32 throughput; int ret; + /* don't query throughput when no longer associated with any + * batman-adv interface + */ + if (!soft_iface) + return false; + /* if the user specified a customised value for this interface, then * return it directly */ throughput = atomic_read(&hard_iface->bat_v.throughput_override); - if (throughput != 0) - return throughput; + if (throughput != 0) { + *pthroughput = throughput; + return true; + } /* if this is a wireless device, then ask its throughput through * cfg80211 API @@ -104,27 +129,39 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * possible to delete this neighbor. For now set * the throughput metric to 0. */ - return 0; + *pthroughput = 0; + return true; } if (ret) goto default_throughput; - if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) { + *pthroughput = sinfo.expected_throughput / 100; + return true; + } /* try to estimate the expected throughput based on reported tx * rates */ - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) - return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + return true; + } goto default_throughput; } + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + /* if not a wifi interface, check if this device provides data via * ethtool (e.g. an Ethernet adapter) */ - rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); if (ret == 0) { @@ -135,13 +172,15 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && throughput != SPEED_UNKNOWN) - return throughput * 10; + if (throughput && throughput != SPEED_UNKNOWN) { + *pthroughput = throughput * 10; + return true; + } } default_throughput: if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { - batadv_info(hard_iface->soft_iface, + batadv_info(soft_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name, BATADV_THROUGHPUT_DEFAULT_VALUE / 10, @@ -150,31 +189,26 @@ default_throughput: } /* if none of the above cases apply, return the base_throughput */ - return BATADV_THROUGHPUT_DEFAULT_VALUE; + *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE; + return true; } /** * batadv_v_elp_throughput_metric_update() - worker updating the throughput * metric of a single hop neighbour - * @work: the work queue item + * @neigh: the neighbour to probe */ -void batadv_v_elp_throughput_metric_update(struct work_struct *work) +static void +batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh) { - struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; - struct batadv_hardif_neigh_node *neigh; - - neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, - metric_work); - neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, - bat_v); + u32 throughput; + bool valid; - ewma_throughput_add(&neigh->bat_v.throughput, - batadv_v_elp_get_throughput(neigh)); + valid = batadv_v_elp_get_throughput(neigh, &throughput); + if (!valid) + return; - /* decrement refcounter to balance increment performed before scheduling - * this task - */ - batadv_hardif_neigh_put(neigh); + ewma_throughput_add(&neigh->bat_v.throughput, throughput); } /** @@ -248,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) */ static void batadv_v_elp_periodic_work(struct work_struct *work) { + struct batadv_v_metric_queue_entry *metric_entry; + struct batadv_v_metric_queue_entry *metric_safe; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_hard_iface *hard_iface; struct batadv_hard_iface_bat_v *bat_v; struct batadv_elp_packet *elp_packet; + struct list_head metric_queue; struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; - bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -291,6 +327,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) atomic_inc(&hard_iface->bat_v.elp_seqno); + INIT_LIST_HEAD(&metric_queue); + /* The throughput metric is updated on each sent packet. This way, if a * node is dead and no longer sends packets, batman-adv is still able to * react timely to its death. @@ -315,16 +353,28 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) /* Reading the estimated throughput from cfg80211 is a task that * may sleep and that is not allowed in an rcu protected - * context. Therefore schedule a task for that. + * context. Therefore add it to metric_queue and process it + * outside rcu protected context. */ - ret = queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); - - if (!ret) + metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); + if (!metric_entry) { batadv_hardif_neigh_put(hardif_neigh); + continue; + } + + metric_entry->hardif_neigh = hardif_neigh; + list_add(&metric_entry->list, &metric_queue); } rcu_read_unlock(); + list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) { + batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh); + + batadv_hardif_neigh_put(metric_entry->hardif_neigh); + list_del(&metric_entry->list); + kfree(metric_entry); + } + restart_timer: batadv_v_elp_start_timer(hard_iface); out: diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 9e2740195fa2..c9cb0a307100 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -10,7 +10,6 @@ #include "main.h" #include <linux/skbuff.h> -#include <linux/workqueue.h> int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); @@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -void batadv_v_elp_throughput_metric_update(struct work_struct *work); #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 3c0a14a582e4..d4b71d34310f 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3937,23 +3937,21 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; - size_t flex_size; + size_t tt_data_sz; if (tvlv_value_len < sizeof(*tt_data)) return; tt_data = tvlv_value; - tvlv_value_len -= sizeof(*tt_data); - num_vlan = ntohs(tt_data->num_vlan); - flex_size = flex_array_size(tt_data, vlan_data, num_vlan); - if (tvlv_value_len < flex_size) + tt_data_sz = struct_size(tt_data, vlan_data, num_vlan); + if (tvlv_value_len < tt_data_sz) return; tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data - + flex_size); - tvlv_value_len -= flex_size; + + tt_data_sz); + tvlv_value_len -= tt_data_sz; num_entries = batadv_tt_entries(tvlv_value_len); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index f491bff8c51b..fe89f08533fe 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v { * neighbor */ unsigned long last_unicast_tx; - - /** @metric_work: work queue callback item for metric update */ - struct work_struct metric_work; }; /** diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig index 6746be07e222..e08aae35351a 100644 --- a/net/bluetooth/hidp/Kconfig +++ b/net/bluetooth/hidp/Kconfig @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config BT_HIDP tristate "HIDP protocol support" - depends on BT_BREDR && INPUT && HID_SUPPORT - select HID + depends on BT_BREDR && HID help HIDP (Human Interface Device Protocol) is a transport layer for HID reports. HIDP is required for the Bluetooth Human diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 27b4c4a2ba1f..fec11e576f31 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -119,7 +119,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_scid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -127,7 +126,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -140,7 +138,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_dcid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -148,7 +145,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -418,7 +414,7 @@ static void l2cap_chan_timeout(struct work_struct *work) if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling * this work. No need to call l2cap_chan_hold(chan) here again. */ @@ -439,7 +435,7 @@ static void l2cap_chan_timeout(struct work_struct *work) l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } struct l2cap_chan *l2cap_chan_create(void) @@ -641,9 +637,9 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) { - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_add(conn, chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } void l2cap_chan_del(struct l2cap_chan *chan, int err) @@ -731,9 +727,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_list(conn, func, data); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } EXPORT_SYMBOL_GPL(l2cap_chan_list); @@ -745,7 +741,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -754,7 +750,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) @@ -948,6 +944,16 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn) return id; } +static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, + u8 flags) +{ + /* Check if the hcon still valid before attempting to send */ + if (hci_conn_valid(conn->hcon->hdev, conn->hcon)) + hci_send_acl(conn->hchan, skb, flags); + else + kfree_skb(skb); +} + static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data) { @@ -970,7 +976,7 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON; skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, flags); + l2cap_send_acl(conn, skb, flags); } static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) @@ -1497,8 +1503,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -1567,8 +1571,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_le_conn_ready(struct l2cap_conn *conn) @@ -1614,7 +1616,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) if (hcon->type == ACL_LINK) l2cap_request_info(conn); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -1632,7 +1634,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); if (hcon->type == LE_LINK) l2cap_le_conn_ready(conn); @@ -1647,14 +1649,10 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags)) l2cap_chan_set_err(chan, err); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_info_timeout(struct work_struct *work) @@ -1665,7 +1663,9 @@ static void l2cap_info_timeout(struct work_struct *work) conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; + mutex_lock(&conn->lock); l2cap_conn_start(conn); + mutex_unlock(&conn->lock); } /* @@ -1757,6 +1757,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + mutex_lock(&conn->lock); + kfree_skb(conn->rx_skb); skb_queue_purge(&conn->pending_rx); @@ -1775,8 +1777,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) /* Force the connection to be immediately dropped */ hcon->disc_timeout = 0; - mutex_lock(&conn->chan_lock); - /* Kill channels */ list_for_each_entry_safe(chan, l, &conn->chan_l, list) { l2cap_chan_hold(chan); @@ -1790,15 +1790,14 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_put(chan); } - mutex_unlock(&conn->chan_lock); - - hci_chan_del(conn->hchan); - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); - hcon->l2cap_data = NULL; + hci_chan_del(conn->hchan); conn->hchan = NULL; + + hcon->l2cap_data = NULL; + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } @@ -2916,8 +2915,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (chan->chan_type != L2CAP_CHAN_RAW) continue; @@ -2932,8 +2929,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) if (chan->ops->recv(chan, nskb)) kfree_skb(nskb); } - - mutex_unlock(&conn->chan_lock); } /* ---- L2CAP signalling commands ---- */ @@ -3952,7 +3947,6 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); /* Check if the ACL is secure enough (if not SDP) */ @@ -4059,7 +4053,6 @@ response: } l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); } @@ -4098,27 +4091,19 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status); - mutex_lock(&conn->chan_lock); - if (scid) { chan = __l2cap_get_chan_by_scid(conn, scid); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } else { chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } chan = l2cap_chan_hold_unless_zero(chan); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4156,9 +4141,6 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); l2cap_chan_put(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4446,11 +4428,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, chan->ops->set_shutdown(chan); - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4487,11 +4465,7 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, return 0; } - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, 0); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4689,13 +4663,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", dcid, mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4743,9 +4713,6 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4857,7 +4824,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -4923,7 +4889,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, response_unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); if (result == L2CAP_CR_PEND) @@ -5057,7 +5022,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -5132,7 +5096,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); response: @@ -5169,8 +5132,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - cmd_len -= sizeof(*rsp); list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { @@ -5256,8 +5217,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); - return err; } @@ -5370,8 +5329,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, if (cmd_len < sizeof(*rej)) return -EPROTO; - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); if (!chan) goto done; @@ -5386,7 +5343,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, l2cap_chan_put(chan); done: - mutex_unlock(&conn->chan_lock); return 0; } @@ -6841,8 +6797,12 @@ static void process_pending_rx(struct work_struct *work) BT_DBG(""); + mutex_lock(&conn->lock); + while ((skb = skb_dequeue(&conn->pending_rx))) l2cap_recv_frame(conn, skb); + + mutex_unlock(&conn->lock); } static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) @@ -6881,7 +6841,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; mutex_init(&conn->ident_lock); - mutex_init(&conn->chan_lock); + mutex_init(&conn->lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); @@ -7072,7 +7032,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, } } - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { @@ -7113,7 +7073,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, chan_unlock: l2cap_chan_unlock(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); done: hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -7325,7 +7285,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -7399,7 +7359,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } /* Append fragment into frame respecting the maximum len of rx_skb */ @@ -7466,19 +7426,45 @@ static void l2cap_recv_reset(struct l2cap_conn *conn) conn->rx_len = 0; } +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) +{ + if (!c) + return NULL; + + BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref)); + + if (!kref_get_unless_zero(&c->ref)) + return NULL; + + return c; +} + void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn; int len; + /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ + hci_dev_lock(hcon->hdev); + + conn = hcon->l2cap_data; + if (!conn) conn = l2cap_conn_add(hcon); - if (!conn) - goto drop; + conn = l2cap_conn_hold_unless_zero(conn); + + hci_dev_unlock(hcon->hdev); + + if (!conn) { + kfree_skb(skb); + return; + } BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); + mutex_lock(&conn->lock); + switch (flags) { case ACL_START: case ACL_START_NO_FLUSH: @@ -7503,7 +7489,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return; + goto unlock; } BT_DBG("Start: total len %d, frag len %u", len, skb->len); @@ -7567,6 +7553,9 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); +unlock: + mutex_unlock(&conn->lock); + l2cap_conn_put(conn); } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 46ea0bee2259..acd11b268b98 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1326,9 +1326,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) /* prevent sk structure from being freed whilst unlocked */ sock_hold(sk); - chan = l2cap_pi(sk)->chan; /* prevent chan structure from being freed whilst unlocked */ - l2cap_chan_hold(chan); + chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); + if (!chan) + goto shutdown_already; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); @@ -1358,22 +1359,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) release_sock(sk); l2cap_chan_lock(chan); - conn = chan->conn; - if (conn) - /* prevent conn structure from being freed */ - l2cap_conn_get(conn); + /* prevent conn structure from being freed */ + conn = l2cap_conn_hold_unless_zero(chan->conn); l2cap_chan_unlock(chan); if (conn) /* mutex lock must be taken before l2cap_chan_lock() */ - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); l2cap_chan_close(chan, 0); l2cap_chan_unlock(chan); if (conn) { - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 305dd72c844c..17226b2341d0 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1132,7 +1132,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size = size; - while (todo_size) { + do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, @@ -1177,7 +1177,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size -= segment_size; session->total_queued_size += segment_size; - } + } while (todo_size); switch (ret) { case 0: /* OK */ diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 95f7a7e65a73..9b72d118d756 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -382,8 +382,9 @@ sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session, skb_queue_walk(&session->skb_queue, do_skb) { do_skcb = j1939_skb_to_cb(do_skb); - if (offset_start >= do_skcb->offset && - offset_start < (do_skcb->offset + do_skb->len)) { + if ((offset_start >= do_skcb->offset && + offset_start < (do_skcb->offset + do_skb->len)) || + (offset_start == 0 && do_skcb->offset == 0 && do_skb->len == 0)) { skb = do_skb; } } diff --git a/net/core/dev.c b/net/core/dev.c index c0021cbd28fc..b91658e8aedb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11286,6 +11286,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. + */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); + if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index e684ba3ebb38..94a7872ab231 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -37,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = { bool fib_rule_matchall(const struct fib_rule *rule) { - if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || - rule->flags) + if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || + rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; @@ -261,12 +261,14 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { - int ret = 0; + int iifindex, oifindex, ret = 0; - if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + iifindex = READ_ONCE(rule->iifindex); + if (iifindex && (iifindex != fl->flowi_iif)) goto out; - if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + oifindex = READ_ONCE(rule->oifindex); + if (oifindex && (oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) @@ -1041,14 +1043,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; - if (rule->iifindex == -1) + if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; - if (rule->oifindex == -1) + if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } @@ -1220,10 +1222,10 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) - rule->iifindex = dev->ifindex; + WRITE_ONCE(rule->iifindex, dev->ifindex); if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) - rule->oifindex = dev->ifindex; + WRITE_ONCE(rule->oifindex, dev->ifindex); } } @@ -1233,9 +1235,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) - rule->iifindex = -1; + WRITE_ONCE(rule->iifindex, -1); if (rule->oifindex == dev->ifindex) - rule->oifindex = -1; + WRITE_ONCE(rule->oifindex, -1); } } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0e638a37aa09..5db41bf2ed93 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1108,10 +1108,12 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + if (skb) { if (!net) { if (skb->dev) - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } @@ -1122,7 +1124,6 @@ bool __skb_flow_dissect(const struct net *net, enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; - rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); @@ -1150,17 +1151,17 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); - if (result == BPF_FLOW_DISSECTOR_CONTINUE) - goto dissect_continue; - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); - rcu_read_unlock(); - return result == BPF_OK; + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } } -dissect_continue: - rcu_read_unlock(); } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 89656d180bc6..bd0251bd74a1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3447,10 +3447,12 @@ static const struct seq_operations neigh_stat_seq_ops = { static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { - struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; + struct net *net; + rcu_read_lock(); + net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; @@ -3463,9 +3465,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; + goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +out: + rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1f4d4b5570ab..d1e559fce918 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3432,6 +3432,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENODEV; rtnl_nets_unlock(&rtnl_nets); + rtnl_nets_destroy(&rtnl_nets); errout: return err; } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 2bd77c94f9f1..d88e9080643b 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -462,6 +462,11 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char ts_flags_names[][ETH_GSTRING_LEN] = { + [const_ilog2(HWTSTAMP_FLAG_BONDED_PHC_INDEX)] = "bonded-phc-index", +}; +static_assert(ARRAY_SIZE(ts_flags_names) == __HWTSTAMP_FLAG_CNT); + const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 850eadde4bfc..58e9e7db06f9 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -13,6 +13,7 @@ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT #define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1) +#define __HWTSTAMP_FLAG_CNT (const_ilog2(HWTSTAMP_FLAG_LAST) + 1) struct link_mode_info { int speed; @@ -38,6 +39,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char ts_flags_names[][ETH_GSTRING_LEN]; extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 34bee42e1247..7609ce2b2c5e 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -993,7 +993,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return rc; /* Nonzero ring with RSS only makes sense if NIC adds them together */ - if (cmd == ETHTOOL_SRXCLSRLINS && info.flow_type & FLOW_RSS && + if (cmd == ETHTOOL_SRXCLSRLINS && info.fs.flow_type & FLOW_RSS && !ops->cap_rss_rxnfc_adds && ethtool_get_flow_spec_ring(info.fs.ring_cookie)) return -EINVAL; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 7cb106b590ab..58df9ad02ce8 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -107,6 +107,8 @@ rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, u32 total_size, indir_bytes; u8 *rss_config; + data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key; + ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); if (!ctx) return -ENOENT; @@ -153,7 +155,6 @@ rss_prepare_data(const struct ethnl_req_info *req_base, if (!ops->cap_rss_ctx_supported && !ops->create_rxfh_context) return -EOPNOTSUPP; - data->no_key_fields = !ops->rxfh_per_ctx_key; return rss_prepare_ctx(request, dev, data, info); } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 818cf01f0911..6b76c05caba4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_TS_FLAGS] = { + .per_dev = false, + .count = __HWTSTAMP_FLAG_CNT, + .strings = ts_flags_names, + }, [ETH_SS_UDP_TUNNEL_TYPES] = { .per_dev = false, .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index 9188e088fb2f..2be356bdfe87 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -54,7 +54,7 @@ static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, data->hwtst_config.tx_type = BIT(cfg.tx_type); data->hwtst_config.rx_filter = BIT(cfg.rx_filter); - data->hwtst_config.flags = BIT(cfg.flags); + data->hwtst_config.flags = cfg.flags; data->hwprov_desc.index = -1; hwprov = rtnl_dereference(dev->hwprov); @@ -91,10 +91,16 @@ static int tsconfig_reply_size(const struct ethnl_req_info *req_base, BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); - if (data->hwtst_config.flags) - /* _TSCONFIG_HWTSTAMP_FLAGS */ - len += nla_total_size(sizeof(u32)); + if (data->hwtst_config.flags) { + ret = ethnl_bitset32_size(&data->hwtst_config.flags, + NULL, __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */ + } if (data->hwtst_config.tx_type) { ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, @@ -130,8 +136,10 @@ static int tsconfig_fill_reply(struct sk_buff *skb, int ret; if (data->hwtst_config.flags) { - ret = nla_put_u32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, - data->hwtst_config.flags); + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + &data->hwtst_config.flags, NULL, + __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); if (ret < 0) return ret; } @@ -180,7 +188,7 @@ const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), - [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_U32 }, + [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, }; @@ -296,6 +304,7 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); if (!netif_device_present(dev)) return -ENODEV; @@ -377,9 +386,13 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, } if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) { - ethnl_update_u32(&hwtst_config.flags, - tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], - &config_mod); + ret = ethnl_update_bitset32(&hwtst_config.flags, + __HWTSTAMP_FLAG_CNT, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], + ts_flags_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; } ret = net_hwtstamp_validate(&hwtst_config); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index cb9a7ed8abd3..f23a1ec6694c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -659,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb */ void arp_xmit(struct sk_buff *skb) { + rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, - dev_net(skb->dev), NULL, skb, NULL, skb->dev, + dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); + rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c8b3cf5fba4c..55b8151759bc 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1371,10 +1371,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; - struct net *net = dev_net(dev); + struct net *net; int master_idx; rcu_read_lock(); + net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 094084b61bff..5482edb5aade 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -399,10 +399,10 @@ static void icmp_push_reply(struct sock *sk, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); bool apply_ratelimit = false; + struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; @@ -608,12 +608,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, struct sock *sk; if (!rt) - goto out; + return; + + rcu_read_lock(); if (rt->dst.dev) - net = dev_net(rt->dst.dev); + net = dev_net_rcu(rt->dst.dev); else if (skb_in->dev) - net = dev_net(skb_in->dev); + net = dev_net_rcu(skb_in->dev); else goto out; @@ -785,7 +787,8 @@ out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); -out:; +out: + rcu_read_unlock(); } EXPORT_SYMBOL(__icmp_send); @@ -834,7 +837,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return; } @@ -868,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) struct net *net; u32 info = 0; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* * Incomplete header ? @@ -979,7 +982,7 @@ out_err: static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1011,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) struct icmp_bxm icmp_param; struct net *net; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? */ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; @@ -1040,9 +1043,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) { + struct net *net = dev_net_rcu(skb->dev); struct icmp_ext_hdr *ext_hdr, _ext_hdr; struct icmp_ext_echo_iio *iio, _iio; - struct net *net = dev_net(skb->dev); struct inet6_dev *in6_dev; struct in_device *in_dev; struct net_device *dev; @@ -1181,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) return SKB_NOT_DROPPED_YET; out_err: - __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1198,7 +1201,7 @@ int icmp_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); struct icmphdr *icmph; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { @@ -1371,9 +1374,9 @@ int icmp_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + struct net *net = dev_net_rcu(skb->dev); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; - struct net *net = dev_net(skb->dev); /* * Use ping_err to handle all icmp errors except those diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 577b88a43293..753704f75b2c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -390,7 +390,13 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); + bool res; + + rcu_read_lock(); + res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); + rcu_read_unlock(); + + return res; } void rt_cache_flush(struct net *net) @@ -1002,9 +1008,9 @@ out: kfree_skb_reason(skb, reason); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; - struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false; + struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) @@ -1014,6 +1020,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (old_mtu < mtu) return; + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1021,9 +1029,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) - return; + goto out; - rcu_read_lock(); if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; @@ -1037,14 +1044,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } - rcu_read_unlock(); - return; + goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } +out: rcu_read_unlock(); } @@ -1307,10 +1314,15 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct net *net = dev_net(dst->dev); unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, - net->ipv4.ip_rt_min_advmss); + unsigned int advmss; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); + advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, + net->ipv4.ip_rt_min_advmss); + rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c472c9a57cf6..a9bb9ce5438e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1141,9 +1141,9 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a6984a29fdb9..4d14ab7f7e99 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); @@ -473,7 +473,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!skb->dev) return; - net = dev_net(skb->dev); + + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules @@ -496,7 +499,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) - return; + goto out; saddr = NULL; } @@ -526,7 +529,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* @@ -535,7 +538,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ @@ -582,7 +585,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) - goto out; + goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; @@ -600,7 +603,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) - goto out; + goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); @@ -616,7 +619,6 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out_dst_release; } - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, @@ -630,13 +632,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } - rcu_read_unlock(); + out_dst_release: dst_release(dst); -out: +out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); +out: + rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); @@ -679,8 +683,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, - skb, 0); + rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -717,7 +721,7 @@ EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; @@ -832,7 +836,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; @@ -889,7 +893,7 @@ out: static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -921,7 +925,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -939,7 +943,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -1034,9 +1038,9 @@ static int icmpv6_rcv(struct sk_buff *skb) csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 28e5a89dc255..2c383c12a431 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -336,7 +336,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb), *cache_dst; + struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; struct in6_addr orig_daddr; struct ioam6_lwt *ilwt; int err = -EINVAL; @@ -407,13 +407,15 @@ do_encap: cache_dst = ip6_route_output(net, NULL, &fl6); if (cache_dst->error) { err = cache_dst->error; - dst_release(cache_dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (dst->lwtstate != cache_dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); if (unlikely(err)) @@ -426,8 +428,10 @@ do_encap: return dst_output(net, sk, skb); } out: + dst_release(cache_dst); return dst->lwtstate->orig_output(net, sk, skb); drop: + dst_release(cache_dst); kfree_skb(skb); return err; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 70c0e16c0ae6..39da6a7ce5f1 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -477,9 +477,7 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); - rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); - rcu_read_unlock(); return 0; } @@ -487,9 +485,15 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_input_finish); + int res; + + rcu_read_lock(); + res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, + ip6_input_finish); + rcu_read_unlock(); + + return res; } EXPORT_SYMBOL_GPL(ip6_input); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9dfdb40988b0..65831b4fee1f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1773,21 +1773,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; - int err; + struct net *net; - sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; - skb = sock_alloc_send_skb(sk, size, 1, &err); + skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; @@ -1795,6 +1793,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); + rcu_read_lock(); + + net = dev_net_rcu(dev); + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address @@ -1806,6 +1810,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + rcu_read_unlock(); + skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); @@ -2165,21 +2171,21 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; + const struct in6_addr *snd_addr, *saddr; + int err, len, payload_len, full_len; + struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; - const struct in6_addr *snd_addr, *saddr; - struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; + struct net *net; + struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -2190,19 +2196,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); + skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); - skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + rcu_read_lock(); + net = dev_net_rcu(dev); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); @@ -2227,9 +2235,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d044c67019de..8699d1a188dc 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -418,15 +418,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); - if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + if (!skb) return NULL; - } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -437,7 +433,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ - skb_set_owner_w(skb, sk); + rcu_read_lock(); + skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); + rcu_read_unlock(); return skb; } @@ -473,16 +471,20 @@ static void ip6_nd_hdr(struct sk_buff *skb, void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(skb->dev); - struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; + struct net *net; + struct sock *sk; int err; - struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); + sk = net->ipv6.ndisc_sk; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; @@ -490,6 +492,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { + rcu_read_unlock(); kfree_skb(skb); return; } @@ -504,7 +507,6 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); @@ -1694,7 +1696,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) bool ret; if (netif_is_l3_master(skb->dev)) { - dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + dev = dev_get_by_index_rcu(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) return; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 78362822b907..ef2d23a1e3d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3196,13 +3196,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) { struct net_device *dev = dst->dev; unsigned int mtu = dst_mtu(dst); - struct net *net = dev_net(dev); + struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + rcu_read_lock(); + + net = dev_net_rcu(dev); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + rcu_read_unlock(); + /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 7ba22d2f2bfe..0ac4283acdf2 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -232,13 +232,15 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) @@ -251,6 +253,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } @@ -269,8 +272,10 @@ static int rpl_input(struct sk_buff *skb) local_bh_enable(); err = rpl_do_srh(skb, rlwt, dst); - if (unlikely(err)) + if (unlikely(err)) { + dst_release(dst); goto drop; + } if (!dst) { ip6_route_input(skb); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 4bf937bfc263..33833b2064c0 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -482,8 +482,10 @@ static int seg6_input_core(struct net *net, struct sock *sk, local_bh_enable(); err = seg6_do_srh(skb, dst); - if (unlikely(err)) + if (unlikely(err)) { + dst_release(dst); goto drop; + } if (!dst) { ip6_route_input(skb); @@ -571,13 +573,15 @@ static int seg6_output_core(struct net *net, struct sock *sk, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) @@ -593,6 +597,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6671daa67f4f..c6ea438b5c75 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1389,9 +1389,9 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 97c6eb8847a0..8cd4cf7ae211 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -381,10 +381,8 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; - if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) { - flow_offload_teardown(flow); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; - } iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4) + ctx->offset; @@ -662,10 +660,8 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; - if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) { - flow_offload_teardown(flow); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; - } ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); thoff = sizeof(*ip6h) + ctx->offset; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 225f6048867f..5d548eda742d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2101,6 +2101,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; + struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, @@ -2117,12 +2118,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; - if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); + rcu_read_lock(); + net_vport = dev_net_rcu(vport->dev); + if (!net_eq(net, net_vport)) { + int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) - goto nla_put_failure; + goto nla_put_failure_unlock; } + rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, @@ -2143,6 +2147,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, genlmsg_end(skb, ovs_header); return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 72c65d938a15..a4a668b88a8f 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -701,11 +701,9 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net_device *dev; ax25_address *source; ax25_uid_assoc *user; + int err = -EINVAL; int n; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; - if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; @@ -718,8 +716,15 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; - if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) - return -EADDRNOTAVAIL; + lock_sock(sk); + + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out_release; + + err = -EADDRNOTAVAIL; + dev = rose_dev_get(&addr->srose_addr); + if (!dev) + goto out_release; source = &addr->srose_call; @@ -730,7 +735,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); - return -EACCES; + err = -EACCES; + goto out_release; } rose->source_call = *source; } @@ -753,8 +759,10 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) rose_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); - - return 0; + err = 0; +out_release: + release_sock(sk); + return err; } static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 718193df9d2e..5e740c486203 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -327,8 +327,8 @@ struct rxrpc_local { * packet with a maximum set of jumbo subpackets or a PING ACK padded * out to 64K with zeropages for PMTUD. */ - struct kvec kvec[RXRPC_MAX_NR_JUMBO > 3 + 16 ? - RXRPC_MAX_NR_JUMBO : 3 + 16]; + struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? + 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* @@ -582,6 +582,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ + RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */ }; /* @@ -602,7 +603,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ @@ -874,8 +874,7 @@ struct rxrpc_txbuf { #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ bool jumboable; /* Can be non-terminal jumbo subpacket */ - u8 nr_kvec; /* Amount of kvec[] used */ - struct kvec kvec[1]; + void *data; /* Data with preceding jumbo header */ }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 5a543c3f6fb0..c4c8b46a68c6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -22,7 +22,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", - [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", @@ -453,17 +452,16 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + __set_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags); break; case RXRPC_CONN_SERVICE: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 713e04394ceb..4d9c5e21ba78 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -228,10 +228,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + if (call && __test_and_clear_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) rxrpc_notify_socket(call); - } } /* @@ -272,6 +270,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, * we've already received the packet, put it on the * front of the queue. */ + sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -437,14 +436,16 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); - switch (skb->mark) { - case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - if (conn->state != RXRPC_CONN_SERVICE) - break; + if (skb) { + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop].call); - break; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop].call); + break; + } } /* Process delayed ACKs whose time has come. */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 7eba4d7d9a38..2f1fd1e2e7e4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -67,6 +67,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + INIT_LIST_HEAD(&conn->attend_link); mutex_init(&conn->security_lock); mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4974b5accafa..9047ba13bd31 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -448,11 +448,19 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; - skb_queue_tail(&call->recvmsg_queue, skb); + spin_lock_irq(&call->recvmsg_queue.lock); + + __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) + /* Change the state inside the lock so that recvmsg syncs + * correctly with it and using sendmsg() to send a reply + * doesn't race. + */ rxrpc_end_rx_phase(call, sp->hdr.serial); + + spin_unlock_irq(&call->recvmsg_queue.lock); } /* @@ -657,7 +665,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb rxrpc_propose_delay_ACK(call, sp->hdr.serial, rxrpc_propose_ack_input_data); } - if (notify) { + if (notify && !test_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) { trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); rxrpc_notify_socket(call); } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 6f7a125d6e90..95905b85a8d7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -428,13 +428,13 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_send_data_req *req, struct rxrpc_txbuf *txb, + struct rxrpc_wire_header *whdr, rxrpc_serial_t serial, int subpkt) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); + struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - struct kvec *kv = &call->local->kvec[subpkt]; + struct kvec *kv = &call->local->kvec[1 + subpkt]; size_t len = txb->pkt_len; bool last; u8 flags; @@ -491,18 +491,15 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, } dont_set_request_ack: - /* The jumbo header overlays the wire header in the txbuf. */ + /* There's a jumbo header prepended to the data if we need it. */ if (subpkt < req->n - 1) flags |= RXRPC_JUMBO_PACKET; else flags &= ~RXRPC_JUMBO_PACKET; if (subpkt == 0) { whdr->flags = flags; - whdr->serial = htonl(txb->serial); whdr->cksum = txb->cksum; - whdr->serviceId = htons(conn->service_id); - kv->iov_base = whdr; - len += sizeof(*whdr); + kv->iov_base = txb->data; } else { jumbo->flags = flags; jumbo->pad = 0; @@ -535,7 +532,9 @@ static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, /* * Prepare a (jumbo) packet for transmission. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_wire_header *whdr) { struct rxrpc_txqueue *tq = req->tq; rxrpc_serial_t serial; @@ -549,6 +548,18 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se /* Each transmission of a Tx packet needs a new serial number */ serial = rxrpc_get_next_serials(call->conn, req->n); + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = htonl(seq); + whdr->serial = htonl(serial); + whdr->type = RXRPC_PACKET_TYPE_DATA; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->conn->service_id); + call->tx_last_serial = serial + req->n - 1; call->tx_last_sent = req->now; xmit_ts = rxrpc_prepare_txqueue(tq, req); @@ -576,7 +587,7 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se if (i + 1 == req->n) /* Only sample the last subpacket in a jumbo. */ __set_bit(ix, &tq->rtt_samples); - len += rxrpc_prepare_data_subpacket(call, req, txb, serial, i); + len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i); serial++; seq++; i++; @@ -618,6 +629,7 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se } rxrpc_set_keepalive(call, req->now); + page_frag_free(whdr); return len; } @@ -626,25 +638,33 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se */ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { + struct rxrpc_wire_header *whdr; struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; struct rxrpc_txqueue *tq = req->tq; struct rxrpc_txbuf *txb; struct msghdr msg; rxrpc_seq_t seq = req->seq; - size_t len; + size_t len = sizeof(*whdr); bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); int ret, stat_ix; _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); + whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS); + if (!whdr) + return; /* Drop the packet if no memory. */ + + call->local->kvec[0].iov_base = whdr; + call->local->kvec[0].iov_len = sizeof(*whdr); + stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1; atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]); - len = rxrpc_prepare_data_packet(call, req); + len += rxrpc_prepare_data_packet(call, req, whdr); txb = tq->bufs[seq & RXRPC_TXQ_MASK]; - iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, req->n, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -695,13 +715,13 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req if (ret == -EMSGSIZE) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); - trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); ret = 0; } else if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { - trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); } rxrpc_tx_backoff(call, ret); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index e874c31fa901..bc283da9ee40 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -169,6 +169,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 && + serr->ee.ee_type == ICMPV6_PKT_TOOBIG && + serr->ee.ee_code == 0)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 62b09d23ec08..6cb37b0eb77f 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -257,8 +257,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level1_hdr *hdr = (void *)(whdr + 1); + struct rxkad_level1_hdr *hdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -274,7 +273,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); + memset(txb->data + txb->offset, 0, pad); txb->pkt_len += pad; } @@ -300,8 +299,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); + struct rxkad_level2_hdr *rxkhdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t content, pad; @@ -319,7 +317,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, txb->pkt_len = round_up(content, RXKAD_ALIGN); pad = txb->pkt_len - content; if (pad) - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); + memset(txb->data + txb->offset, 0, pad); /* encrypt from the session key */ token = call->conn->key->payload.data[0]; @@ -407,9 +405,8 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) /* Clear excess space in the packet */ if (txb->pkt_len < txb->alloc_size) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; size_t gap = txb->alloc_size - txb->pkt_len; - void *p = whdr + 1; + void *p = txb->data; memset(p + txb->pkt_len, 0, gap); } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0e8da909d4f2..84dc6c94f23b 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -419,7 +419,7 @@ reload: size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); - if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, + if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); @@ -445,8 +445,6 @@ reload: ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; - - txb->kvec[0].iov_len += txb->len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } @@ -707,7 +705,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: - case RXRPC_CALL_SERVER_SECURING: + case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 131d9e55c8e9..c550991d48fa 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -19,17 +19,19 @@ atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp) { - struct rxrpc_wire_header *whdr; struct rxrpc_txbuf *txb; - size_t total, hoff; + size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); void *buf; txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; - hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); - total = hoff + sizeof(*whdr) + data_size; + /* We put a jumbo header in the buffer, but not a full wire header to + * avoid delayed-corruption problems with zerocopy. + */ + doff = round_up(jsize, data_align); + total = doff + data_size; data_align = umax(data_align, L1_CACHE_BYTES); mutex_lock(&call->conn->tx_data_alloc_lock); @@ -41,30 +43,15 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return NULL; } - whdr = buf + hoff; - refcount_set(&txb->ref, 1); txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); txb->alloc_size = data_size; txb->space = data_size; - txb->offset = sizeof(*whdr); + txb->offset = 0; txb->flags = call->conn->out_clientflag; txb->seq = call->send_top + 1; - txb->nr_kvec = 1; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = htonl(txb->seq); - whdr->type = RXRPC_PACKET_TYPE_DATA; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); + txb->data = buf + doff; trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, rxrpc_txbuf_alloc_data); @@ -90,14 +77,10 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { - int i; - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); - for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base && - !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base)))) - page_frag_free(txb->kvec[i].iov_base); + if (txb->data) + page_frag_free(txb->data); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index b50b2c2cc09b..e6bfd39ff339 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -40,6 +40,9 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; + if (unlikely(READ_ONCE(sch->limit) == 0)) + return qdisc_drop(skb, sch, to_free); + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 71ec9986ed37..fdd79d3ccd8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -749,9 +749,9 @@ deliver: if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); - qdisc_tree_reduce_backlog(sch, 1, pkt_len); sch->qstats.backlog -= pkt_len; sch->q.qlen--; + qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } diff --git a/net/socket.c b/net/socket.c index 262a28b59c7f..28bae5a94234 100644 --- a/net/socket.c +++ b/net/socket.c @@ -479,6 +479,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); + /* + * Disable permission and pre-content events, but enable legacy + * inotify events for legacy users. + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return file; } EXPORT_SYMBOL(sock_alloc_file); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 075695173648..53a081d49d28 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -824,13 +824,19 @@ static void __vsock_release(struct sock *sk, int level) */ lock_sock_nested(sk, level); - sock_orphan(sk); + /* Indicate to vsock_remove_sock() that the socket is being released and + * can be removed from the bound_table. Unlike transport reassignment + * case, where the socket must remain bound despite vsock_remove_sock() + * being called from the transport release() callback. + */ + sock_set_flag(sk, SOCK_DEAD); if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); + sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); diff --git a/rust/Makefile b/rust/Makefile index 8fcfd60447bc..ea3849eb78f6 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -144,7 +144,7 @@ rusttestlib-kernel: private rustc_target_flags = --extern ffi \ --extern bindings --extern uapi rusttestlib-kernel: $(src)/kernel/lib.rs \ rusttestlib-bindings rusttestlib-uapi rusttestlib-build_error \ - $(obj)/libmacros.so $(obj)/bindings.o FORCE + $(obj)/$(libmacros_name) $(obj)/bindings.o FORCE +$(call if_changed,rustc_test_library) rusttestlib-bindings: private rustc_target_flags = --extern ffi @@ -240,6 +240,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ -fzero-call-used-regs=% -fno-stack-clash-protection \ -fno-inline-functions-called-once -fsanitize=bounds-strict \ -fstrict-flex-arrays=% -fmin-function-alignment=% \ + -fzero-init-padding-bits=% \ --param=% --param asan-% # Derived from `scripts/Makefile.clang`. @@ -331,7 +332,7 @@ $(obj)/bindings/bindings_helpers_generated.rs: private bindgen_target_extra = ; $(obj)/bindings/bindings_helpers_generated.rs: $(src)/helpers/helpers.c FORCE $(call if_changed_dep,bindgen) -rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && $$3!~/__cfi/ { printf $(2),$$3 }' +rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && $$3!~/__cfi/ && $$3!~/__odr_asan/ { printf $(2),$$3 }' quiet_cmd_exports = EXPORTS $@ cmd_exports = \ diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs index 3f9236c1c9d5..7fd1ea8265a5 100644 --- a/rust/kernel/init.rs +++ b/rust/kernel/init.rs @@ -870,7 +870,7 @@ pub unsafe trait PinInit<T: ?Sized, E = Infallible>: Sized { /// use kernel::{types::Opaque, init::pin_init_from_closure}; /// #[repr(C)] /// struct RawFoo([u8; 16]); - /// extern { + /// extern "C" { /// fn init_foo(_: *mut RawFoo); /// } /// diff --git a/samples/hid/Makefile b/samples/hid/Makefile index 8ea59e9631a3..db5a077c77fc 100644 --- a/samples/hid/Makefile +++ b/samples/hid/Makefile @@ -40,16 +40,17 @@ BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic endif endif -TPROGS_CFLAGS += -Wall -O2 -TPROGS_CFLAGS += -Wmissing-prototypes -TPROGS_CFLAGS += -Wstrict-prototypes +COMMON_CFLAGS += -Wall -O2 +COMMON_CFLAGS += -Wmissing-prototypes +COMMON_CFLAGS += -Wstrict-prototypes +TPROGS_CFLAGS += $(COMMON_CFLAGS) TPROGS_CFLAGS += -I$(objtree)/usr/include TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) TPROGS_CFLAGS += -I$(srctree)/tools/include ifdef SYSROOT -TPROGS_CFLAGS += --sysroot=$(SYSROOT) +COMMON_CFLAGS += --sysroot=$(SYSROOT) TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib endif @@ -112,7 +113,7 @@ clean: $(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) # Fix up variables inherited from Kbuild that tools/ build system won't like - $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ + $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(HID_SAMPLES_PATH)/../../ \ O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ $@ install_headers @@ -163,7 +164,7 @@ $(obj)/hid_surface_dial.o: $(obj)/hid_surface_dial.skel.h VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ - $(abspath ./vmlinux) + $(abspath $(objtree)/vmlinux) VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) $(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index eb719f6d8d53..dc081cf46d21 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -31,6 +31,11 @@ KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds ifdef CONFIG_CC_IS_CLANG # The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable. KBUILD_CFLAGS += -Wno-gnu + +# Clang checks for overflow/truncation with '%p', while GCC does not: +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219 +KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf) +KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf) else # gcc inanely warns about local variables called 'main' @@ -105,11 +110,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, packed-not-aligned) KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation) -else -# Clang checks for overflow/truncation with '%p', while GCC does not: -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219 -KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf) -KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf) endif KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) @@ -133,7 +133,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast) KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access) KBUILD_CFLAGS += -Wno-enum-compare-conditional -KBUILD_CFLAGS += -Wno-enum-enum-conversion endif endif @@ -157,6 +156,10 @@ KBUILD_CFLAGS += -Wno-missing-field-initializers KBUILD_CFLAGS += -Wno-type-limits KBUILD_CFLAGS += -Wno-shift-negative-value +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-enum-enum-conversion +endif + ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += -Wno-maybe-uninitialized endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index ad55ef201aac..cad20f0e66ee 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -305,7 +305,7 @@ endef # These are shared by some Makefile.* files. ifdef CONFIG_LTO_CLANG -# Run $(LD) here to covert LLVM IR to ELF in the following cases: +# Run $(LD) here to convert LLVM IR to ELF in the following cases: # - when this object needs objtool processing, as objtool cannot process LLVM IR # - when this is a single-object module, as modpost cannot process LLVM IR cmd_ld_single = $(if $(objtool-enabled)$(is-single-obj-m), ; $(LD) $(ld_flags) -r -o $(tmp-target) $@; mv $(tmp-target) $@) diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 0d00ac3723b5..4fd6b6ab3e32 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -165,6 +165,18 @@ impl KernelConfig { let option = "CONFIG_".to_owned() + option; self.0.contains_key(&option) } + + /// Is the rustc version at least `major.minor.patch`? + fn rustc_version_atleast(&self, major: u32, minor: u32, patch: u32) -> bool { + let check_version = 100000 * major + 100 * minor + patch; + let actual_version = self + .0 + .get("CONFIG_RUSTC_VERSION") + .unwrap() + .parse::<u32>() + .unwrap(); + check_version <= actual_version + } } fn main() { @@ -182,6 +194,9 @@ fn main() { } } else if cfg.has("X86_64") { ts.push("arch", "x86_64"); + if cfg.rustc_version_atleast(1, 86, 0) { + ts.push("rustc-abi", "x86-softfloat"); + } ts.push( "data-layout", "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", @@ -215,6 +230,9 @@ fn main() { panic!("32-bit x86 only works under UML"); } ts.push("arch", "x86"); + if cfg.rustc_version_atleast(1, 86, 0) { + ts.push("rustc-abi", "x86-softfloat"); + } ts.push( "data-layout", "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128", diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index e18ae7dc8140..36b28987a2f0 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -507,6 +507,9 @@ static int parse_elf(struct elf_info *info, const char *filename) info->modinfo_len = sechdrs[i].sh_size; } else if (!strcmp(secname, ".export_symbol")) { info->export_symbol_secndx = i; + } else if (!strcmp(secname, ".no_trim_symbol")) { + info->no_trim_symbol = (void *)hdr + sechdrs[i].sh_offset; + info->no_trim_symbol_len = sechdrs[i].sh_size; } if (sechdrs[i].sh_type == SHT_SYMTAB) { @@ -1566,6 +1569,14 @@ static void read_symbols(const char *modname) /* strip trailing .o */ mod = new_module(modname, strlen(modname) - strlen(".o")); + /* save .no_trim_symbol section for later use */ + if (info.no_trim_symbol_len) { + mod->no_trim_symbol = xmalloc(info.no_trim_symbol_len); + memcpy(mod->no_trim_symbol, info.no_trim_symbol, + info.no_trim_symbol_len); + mod->no_trim_symbol_len = info.no_trim_symbol_len; + } + if (!mod->is_vmlinux) { license = get_modinfo(&info, "license"); if (!license) @@ -1728,6 +1739,28 @@ static void handle_white_list_exports(const char *white_list) free(buf); } +/* + * Keep symbols recorded in the .no_trim_symbol section. This is necessary to + * prevent CONFIG_TRIM_UNUSED_KSYMS from dropping EXPORT_SYMBOL because + * symbol_get() relies on the symbol being present in the ksymtab for lookups. + */ +static void keep_no_trim_symbols(struct module *mod) +{ + unsigned long size = mod->no_trim_symbol_len; + + for (char *s = mod->no_trim_symbol; s; s = next_string(s , &size)) { + struct symbol *sym; + + /* + * If find_symbol() returns NULL, this symbol is not provided + * by any module, and symbol_get() will fail. + */ + sym = find_symbol(s); + if (sym) + sym->used = true; + } +} + static void check_modname_len(struct module *mod) { const char *mod_name; @@ -2254,6 +2287,8 @@ int main(int argc, char **argv) read_symbols_from_files(files_source); list_for_each_entry(mod, &modules, list) { + keep_no_trim_symbols(mod); + if (mod->dump_file || mod->is_vmlinux) continue; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index ffd0a52a606e..59366f456b76 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -111,6 +111,8 @@ struct module_alias { * * @dump_file: path to the .symvers file if loaded from a file * @aliases: list head for module_aliases + * @no_trim_symbol: .no_trim_symbol section data + * @no_trim_symbol_len: length of the .no_trim_symbol section */ struct module { struct list_head list; @@ -128,6 +130,8 @@ struct module { // Actual imported namespaces struct list_head imported_namespaces; struct list_head aliases; + char *no_trim_symbol; + unsigned int no_trim_symbol_len; char name[]; }; @@ -141,6 +145,8 @@ struct elf_info { char *strtab; char *modinfo; unsigned int modinfo_len; + char *no_trim_symbol; + unsigned int no_trim_symbol_len; /* support for 32bit section numbers */ diff --git a/scripts/module.lds.S b/scripts/module.lds.S index c2f80f9141d4..450f1088d5fd 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -16,6 +16,7 @@ SECTIONS { *(.discard) *(.discard.*) *(.export_symbol) + *(.no_trim_symbol) } __ksymtab 0 : ALIGN(8) { *(SORT(___ksymtab+*)) } diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index bb6e23c1174e..b724626ea0ca 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -63,7 +63,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then # Clear VPATH and srcroot because the source files reside in the output # directory. # shellcheck disable=SC2016 # $(MAKE), $(CC), and $(build) will be expanded by Make - "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC=$(CC) VPATH= srcroot=. $(build)='"${destdir}"/scripts + "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC="$(CC)" VPATH= srcroot=. $(build)='"${destdir}"/scripts rm -f "${destdir}/scripts/Kbuild" fi diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index d9fa69632147..0f78898bce09 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -1981,6 +1981,114 @@ static int tomoyo_truncate(char *str) } /** + * tomoyo_numscan - sscanf() which stores the length of a decimal integer value. + * + * @str: String to scan. + * @head: Leading string that must start with. + * @width: Pointer to "int" for storing length of a decimal integer value after @head. + * @tail: Optional character that must match after a decimal integer value. + * + * Returns whether @str starts with @head and a decimal value follows @head. + */ +static bool tomoyo_numscan(const char *str, const char *head, int *width, const char tail) +{ + const char *cp; + const int n = strlen(head); + + if (!strncmp(str, head, n)) { + cp = str + n; + while (*cp && *cp >= '0' && *cp <= '9') + cp++; + if (*cp == tail || !tail) { + *width = cp - (str + n); + return *width != 0; + } + } + *width = 0; + return 0; +} + +/** + * tomoyo_patternize_path - Make patterns for file path. Used by learning mode. + * + * @buffer: Destination buffer. + * @len: Size of @buffer. + * @entry: Original line. + * + * Returns nothing. + */ +static void tomoyo_patternize_path(char *buffer, const int len, char *entry) +{ + int width; + char *cp = entry; + + /* Nothing to do if this line is not for "file" related entry. */ + if (strncmp(entry, "file ", 5)) + goto flush; + /* + * Nothing to do if there is no colon in this line, for this rewriting + * applies to only filesystems where numeric values in the path are volatile. + */ + cp = strchr(entry + 5, ':'); + if (!cp) { + cp = entry; + goto flush; + } + /* Flush e.g. "file ioctl" part. */ + while (*cp != ' ') + cp--; + *cp++ = '\0'; + tomoyo_addprintf(buffer, len, "%s ", entry); + /* e.g. file ioctl pipe:[$INO] $CMD */ + if (tomoyo_numscan(cp, "pipe:[", &width, ']')) { + cp += width + 7; + tomoyo_addprintf(buffer, len, "pipe:[\\$]"); + goto flush; + } + /* e.g. file ioctl socket:[$INO] $CMD */ + if (tomoyo_numscan(cp, "socket:[", &width, ']')) { + cp += width + 9; + tomoyo_addprintf(buffer, len, "socket:[\\$]"); + goto flush; + } + if (!strncmp(cp, "proc:/self", 10)) { + /* e.g. file read proc:/self/task/$TID/fdinfo/$FD */ + cp += 10; + tomoyo_addprintf(buffer, len, "proc:/self"); + } else if (tomoyo_numscan(cp, "proc:/", &width, 0)) { + /* e.g. file read proc:/$PID/task/$TID/fdinfo/$FD */ + /* + * Don't patternize $PID part if $PID == 1, for several + * programs access only files in /proc/1/ directory. + */ + cp += width + 6; + if (width == 1 && *(cp - 1) == '1') + tomoyo_addprintf(buffer, len, "proc:/1"); + else + tomoyo_addprintf(buffer, len, "proc:/\\$"); + } else { + goto flush; + } + /* Patternize $TID part if "/task/" follows. */ + if (tomoyo_numscan(cp, "/task/", &width, 0)) { + cp += width + 6; + tomoyo_addprintf(buffer, len, "/task/\\$"); + } + /* Patternize $FD part if "/fd/" or "/fdinfo/" follows. */ + if (tomoyo_numscan(cp, "/fd/", &width, 0)) { + cp += width + 4; + tomoyo_addprintf(buffer, len, "/fd/\\$"); + } else if (tomoyo_numscan(cp, "/fdinfo/", &width, 0)) { + cp += width + 8; + tomoyo_addprintf(buffer, len, "/fdinfo/\\$"); + } +flush: + /* Flush remaining part if any. */ + if (*cp) + tomoyo_addprintf(buffer, len, "%s", cp); +} + +/** * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode. * * @domain: Pointer to "struct tomoyo_domain_info". @@ -2003,7 +2111,8 @@ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header) if (!cp) return; *cp++ = '\0'; - len = strlen(cp) + 1; + /* Reserve some space for potentially using patterns. */ + len = strlen(cp) + 16; /* strstr() will return NULL if ordering is wrong. */ if (*cp == 'f') { argv0 = strstr(header, " argv[]={ \""); @@ -2020,40 +2129,10 @@ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header) if (symlink) len += tomoyo_truncate(symlink + 1) + 1; } - buffer = kmalloc(len, GFP_NOFS); + buffer = kmalloc(len, GFP_NOFS | __GFP_ZERO); if (!buffer) return; - snprintf(buffer, len - 1, "%s", cp); - if (*cp == 'f' && strchr(buffer, ':')) { - /* Automatically replace 2 or more digits with \$ pattern. */ - char *cp2; - - /* e.g. file read proc:/$PID/stat */ - cp = strstr(buffer, " proc:/"); - if (cp && simple_strtoul(cp + 7, &cp2, 10) >= 10 && *cp2 == '/') { - *(cp + 7) = '\\'; - *(cp + 8) = '$'; - memmove(cp + 9, cp2, strlen(cp2) + 1); - goto ok; - } - /* e.g. file ioctl pipe:[$INO] $CMD */ - cp = strstr(buffer, " pipe:["); - if (cp && simple_strtoul(cp + 7, &cp2, 10) >= 10 && *cp2 == ']') { - *(cp + 7) = '\\'; - *(cp + 8) = '$'; - memmove(cp + 9, cp2, strlen(cp2) + 1); - goto ok; - } - /* e.g. file ioctl socket:[$INO] $CMD */ - cp = strstr(buffer, " socket:["); - if (cp && simple_strtoul(cp + 9, &cp2, 10) >= 10 && *cp2 == ']') { - *(cp + 9) = '\\'; - *(cp + 10) = '$'; - memmove(cp + 11, cp2, strlen(cp2) + 1); - goto ok; - } - } -ok: + tomoyo_patternize_path(buffer, len, cp); if (realpath) tomoyo_addprintf(buffer, len, " exec.%s", realpath); if (argv0) diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 3a7b0874cf44..5f9ccab26e9a 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -920,7 +920,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, #ifdef CONFIG_MMU /* * This is called at execve() time in order to dig around - * in the argv/environment of the new proceess + * in the argv/environment of the new process * (represented by bprm). */ mmap_read_lock(bprm->mm); diff --git a/security/tomoyo/securityfs_if.c b/security/tomoyo/securityfs_if.c index a2705798476f..7e69747b2f77 100644 --- a/security/tomoyo/securityfs_if.c +++ b/security/tomoyo/securityfs_if.c @@ -229,11 +229,11 @@ static void __init tomoyo_create_entry(const char *name, const umode_t mode, } /** - * tomoyo_initerface_init - Initialize /sys/kernel/security/tomoyo/ interface. + * tomoyo_interface_init - Initialize /sys/kernel/security/tomoyo/ interface. * * Returns 0. */ -static int __init tomoyo_initerface_init(void) +static int __init tomoyo_interface_init(void) { struct tomoyo_domain_info *domain; struct dentry *tomoyo_dir; @@ -270,4 +270,4 @@ static int __init tomoyo_initerface_init(void) return 0; } -fs_initcall(tomoyo_initerface_init); +fs_initcall(tomoyo_interface_init); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 04a92c3d65d4..d6ebcd9db80a 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -549,10 +549,7 @@ static const struct lsm_id tomoyo_lsmid = { .id = LSM_ID_TOMOYO, }; -/* - * tomoyo_security_ops is a "struct security_operations" which is used for - * registering TOMOYO. - */ +/* tomoyo_hooks is used for registering TOMOYO. */ static struct security_hook_list tomoyo_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds), diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index ca8a7edff3dd..319aaa004c40 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -252,6 +252,7 @@ def test_rss_queue_reconfigure(cfg, main_ctx=True): try: # this targets queue 4, which doesn't exist ntuple2 = ethtool_create(cfg, "-N", flow) + defer(ethtool, f"-N {cfg.ifname} delete {ntuple2}") except CmdExitFailure: pass else: @@ -259,7 +260,13 @@ def test_rss_queue_reconfigure(cfg, main_ctx=True): # change the table to target queues 0 and 2 ethtool(f"-X {cfg.ifname} {ctx_ref} weight 1 0 1 0") # ntuple rule therefore targets queues 1 and 3 - ntuple2 = ethtool_create(cfg, "-N", flow) + try: + ntuple2 = ethtool_create(cfg, "-N", flow) + except CmdExitFailure: + ksft_pr("Driver does not support rss + queue offset") + return + + defer(ethtool, f"-N {cfg.ifname} delete {ntuple2}") # should replace existing filter ksft_eq(ntuple, ntuple2) _send_traffic_check(cfg, port, ctx_ref, { 'target': (1, 3), diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 8eb6aa606a0d..46d289611ce8 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -383,6 +383,10 @@ static void test_statmount_mnt_point(void) return; } + if (!(sm->mask & STATMOUNT_MNT_POINT)) { + ksft_test_result_fail("missing STATMOUNT_MNT_POINT in mask\n"); + return; + } if (strcmp(sm->str + sm->mnt_point, "/") != 0) { ksft_test_result_fail("unexpected mount point: '%s' != '/'\n", sm->str + sm->mnt_point); @@ -408,6 +412,10 @@ static void test_statmount_mnt_root(void) strerror(errno)); return; } + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("missing STATMOUNT_MNT_ROOT in mask\n"); + return; + } mnt_root = sm->str + sm->mnt_root; last_root = strrchr(mnt_root, '/'); if (last_root) @@ -437,6 +445,10 @@ static void test_statmount_fs_type(void) strerror(errno)); return; } + if (!(sm->mask & STATMOUNT_FS_TYPE)) { + ksft_test_result_fail("missing STATMOUNT_FS_TYPE in mask\n"); + return; + } fs_type = sm->str + sm->fs_type; for (s = known_fs; s != NULL; s++) { if (strcmp(fs_type, *s) == 0) @@ -464,6 +476,11 @@ static void test_statmount_mnt_opts(void) return; } + if (!(sm->mask & STATMOUNT_MNT_BASIC)) { + ksft_test_result_fail("missing STATMOUNT_MNT_BASIC in mask\n"); + return; + } + while (getline(&line, &len, f_mountinfo) != -1) { int i; char *p, *p2; @@ -514,7 +531,10 @@ static void test_statmount_mnt_opts(void) if (p2) *p2 = '\0'; - statmount_opts = sm->str + sm->mnt_opts; + if (sm->mask & STATMOUNT_MNT_OPTS) + statmount_opts = sm->str + sm->mnt_opts; + else + statmount_opts = ""; if (strcmp(statmount_opts, p) != 0) ksft_test_result_fail( "unexpected mount options: '%s' != '%s'\n", diff --git a/tools/testing/selftests/kvm/s390/cmma_test.c b/tools/testing/selftests/kvm/s390/cmma_test.c index e32dd59703a0..85cc8c18d6e7 100644 --- a/tools/testing/selftests/kvm/s390/cmma_test.c +++ b/tools/testing/selftests/kvm/s390/cmma_test.c @@ -444,7 +444,7 @@ static void assert_no_pages_cmma_dirty(struct kvm_vm *vm) ); } -static void test_get_inital_dirty(void) +static void test_get_initial_dirty(void) { struct kvm_vm *vm = create_vm_two_memslots(); struct kvm_vcpu *vcpu; @@ -651,7 +651,7 @@ struct testdef { } testlist[] = { { "migration mode and dirty tracking", test_migration_mode }, { "GET_CMMA_BITS: basic calls", test_get_cmma_basic }, - { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty }, + { "GET_CMMA_BITS: all pages are dirty initially", test_get_initial_dirty }, { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes }, }; diff --git a/tools/testing/selftests/kvm/s390/ucontrol_test.c b/tools/testing/selftests/kvm/s390/ucontrol_test.c index 135ee22856cf..d265b34c54be 100644 --- a/tools/testing/selftests/kvm/s390/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390/ucontrol_test.c @@ -88,10 +88,6 @@ asm("test_skey_asm:\n" " ahi %r0,1\n" " st %r1,0(%r5,%r6)\n" - " iske %r1,%r6\n" - " ahi %r0,1\n" - " diag 0,0,0x44\n" - " sske %r1,%r6\n" " xgr %r1,%r1\n" " iske %r1,%r6\n" @@ -459,10 +455,14 @@ TEST_F(uc_kvm, uc_no_user_region) }; ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION, ®ion)); - ASSERT_EQ(EINVAL, errno); + ASSERT_TRUE(errno == EEXIST || errno == EINVAL) + TH_LOG("errno %s (%i) not expected for ioctl KVM_SET_USER_MEMORY_REGION", + strerror(errno), errno); ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION2, ®ion2)); - ASSERT_EQ(EINVAL, errno); + ASSERT_TRUE(errno == EEXIST || errno == EINVAL) + TH_LOG("errno %s (%i) not expected for ioctl KVM_SET_USER_MEMORY_REGION2", + strerror(errno), errno); } TEST_F(uc_kvm, uc_map_unmap) @@ -596,7 +596,9 @@ TEST_F(uc_kvm, uc_skey) ASSERT_EQ(true, uc_handle_exit(self)); ASSERT_EQ(1, sync_regs->gprs[0]); - /* ISKE */ + /* SSKE + ISKE */ + sync_regs->gprs[1] = skeyvalue; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; ASSERT_EQ(0, uc_run_once(self)); /* @@ -608,21 +610,11 @@ TEST_F(uc_kvm, uc_skey) TEST_ASSERT_EQ(0, sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)); TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); - TEST_REQUIRE(sie_block->ipa != 0xb229); + TEST_REQUIRE(sie_block->ipa != 0xb22b); - /* ISKE contd. */ + /* SSKE + ISKE contd. */ ASSERT_EQ(false, uc_handle_exit(self)); ASSERT_EQ(2, sync_regs->gprs[0]); - /* assert initial skey (ACC = 0, R & C = 1) */ - ASSERT_EQ(0x06, sync_regs->gprs[1]); - uc_assert_diag44(self); - - /* SSKE + ISKE */ - sync_regs->gprs[1] = skeyvalue; - run->kvm_dirty_regs |= KVM_SYNC_GPRS; - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(false, uc_handle_exit(self)); - ASSERT_EQ(3, sync_regs->gprs[0]); ASSERT_EQ(skeyvalue, sync_regs->gprs[1]); uc_assert_diag44(self); @@ -631,7 +623,7 @@ TEST_F(uc_kvm, uc_skey) run->kvm_dirty_regs |= KVM_SYNC_GPRS; ASSERT_EQ(0, uc_run_once(self)); ASSERT_EQ(false, uc_handle_exit(self)); - ASSERT_EQ(4, sync_regs->gprs[0]); + ASSERT_EQ(3, sync_regs->gprs[0]); /* assert R reset but rest of skey unchanged */ ASSERT_EQ(skeyvalue & 0xfa, sync_regs->gprs[1]); ASSERT_EQ(0, sync_regs->gprs[1] & 0x04); diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index e5d06fb40233..15601402dee6 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -306,7 +306,8 @@ function check_result { result=$(dmesg | awk -v last_dmesg="$LAST_DMESG" 'p; $0 == last_dmesg { p=1 }' | \ grep -e 'livepatch:' -e 'test_klp' | \ grep -v '\(tainting\|taints\) kernel' | \ - sed 's/^\[[ 0-9.]*\] //') + sed 's/^\[[ 0-9.]*\] //' | \ + sed 's/^\[[ ]*[CT][0-9]*\] //') if [[ "$expect" == "$result" ]] ; then echo "ok" diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 414addef9a45..d240d02fa443 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1302,7 +1302,7 @@ again: return ret; if (cfg_truncate > 0) { - xdisconnect(fd); + shutdown(fd, SHUT_WR); } else if (--cfg_repeat > 0) { xdisconnect(fd); diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 3f2fca02fec5..36ff28af4b19 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -103,6 +103,19 @@ struct testcase testcases_v4[] = { .r_num_mss = 1, }, { + /* datalen <= MSS < gso_len: will fall back to no GSO */ + .tlen = CONST_MSS_V4, + .gso_len = CONST_MSS_V4 + 1, + .r_num_mss = 0, + .r_len_last = CONST_MSS_V4, + }, + { + /* MSS < datalen < gso_len: fail */ + .tlen = CONST_MSS_V4 + 1, + .gso_len = CONST_MSS_V4 + 2, + .tfail = true, + }, + { /* send a single MSS + 1B */ .tlen = CONST_MSS_V4 + 1, .gso_len = CONST_MSS_V4, @@ -206,6 +219,19 @@ struct testcase testcases_v6[] = { .r_num_mss = 1, }, { + /* datalen <= MSS < gso_len: will fall back to no GSO */ + .tlen = CONST_MSS_V6, + .gso_len = CONST_MSS_V6 + 1, + .r_num_mss = 0, + .r_len_last = CONST_MSS_V6, + }, + { + /* MSS < datalen < gso_len: fail */ + .tlen = CONST_MSS_V6 + 1, + .gso_len = CONST_MSS_V6 + 2, + .tfail = true + }, + { /* send a single MSS + 1B */ .tlen = CONST_MSS_V6 + 1, .gso_len = CONST_MSS_V6, diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 8c3a73461475..14ba51b52095 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -47,6 +47,7 @@ #include <linux/kcmp.h> #include <sys/resource.h> #include <sys/capability.h> +#include <linux/perf_event.h> #include <unistd.h> #include <sys/syscall.h> @@ -68,6 +69,10 @@ # define PR_SET_PTRACER 0x59616d61 #endif +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -4888,6 +4893,200 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } +noinline int probed(void) +{ + return 1; +} + +static int parse_uint_from_file(const char *file, const char *fmt) +{ + int err = -1, ret; + FILE *f; + + f = fopen(file, "re"); + if (f) { + err = fscanf(f, fmt, &ret); + fclose(f); + } + return err == 1 ? ret : err; +} + +static int determine_uprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_uprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +static ssize_t get_uprobe_offset(const void *addr) +{ + size_t start, base, end; + bool found = false; + char buf[256]; + FILE *f; + + f = fopen("/proc/self/maps", "r"); + if (!f) + return -1; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { + if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { + found = true; + break; + } + } + fclose(f); + return found ? (uintptr_t)addr - start + base : -1; +} + +FIXTURE(URETPROBE) { + int fd; +}; + +FIXTURE_VARIANT(URETPROBE) { + /* + * All of the URETPROBE behaviors can be tested with either + * uretprobe attached or not + */ + bool attach; +}; + +FIXTURE_VARIANT_ADD(URETPROBE, attached) { + .attach = true, +}; + +FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { + .attach = false, +}; + +FIXTURE_SETUP(URETPROBE) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + ssize_t offset; + int type, bit; + +#ifndef __NR_uretprobe + SKIP(return, "__NR_uretprobe syscall not defined"); +#endif + + if (!variant->attach) + return; + + memset(&attr, 0, attr_sz); + + type = determine_uprobe_perf_type(); + ASSERT_GE(type, 0); + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + offset = get_uprobe_offset(probed); + ASSERT_GE(offset, 0); + + attr.config |= 1 << bit; + attr.size = attr_sz; + attr.type = type; + attr.config1 = ptr_to_u64("/proc/self/exe"); + attr.config2 = offset; + + self->fd = syscall(__NR_perf_event_open, &attr, + getpid() /* pid */, -1 /* cpu */, -1 /* group_fd */, + PERF_FLAG_FD_CLOEXEC); +} + +FIXTURE_TEARDOWN(URETPROBE) +{ + /* we could call close(self->fd), but we'd need extra filter for + * that and since we are calling _exit right away.. + */ +} + +static int run_probed_with_filter(struct sock_fprog *prog) +{ + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || + seccomp(SECCOMP_SET_MODE_FILTER, 0, prog)) { + return -1; + } + + probed(); + return 0; +} + +TEST_F(URETPROBE, uretprobe_default_allow) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + +TEST_F(URETPROBE, uretprobe_default_block) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + +TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), +#ifdef __NR_uretprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), +#endif + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + +TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), +#ifdef __NR_uretprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), +#endif + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + /* * TODO: * - expand NNP testing diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index d3dd65b05b5f..9044ac054167 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -94,5 +94,37 @@ "$TC qdisc del dev $DUMMY ingress", "$IP addr del 10.10.10.10/24 dev $DUMMY" ] - } + }, + { + "id": "a4b9", + "name": "Test class qlen notification", + "category": [ + "qdisc" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: drr", + "$TC filter add dev $DUMMY parent 1: basic classid 1:1", + "$TC class add dev $DUMMY parent 1: classid 1:1 drr", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2: netem", + "$TC qdisc add dev $DUMMY parent 2: handle 3: drr", + "$TC filter add dev $DUMMY parent 3: basic action drop", + "$TC class add dev $DUMMY parent 3: classid 3:1 drr", + "$TC class del dev $DUMMY classid 1:1", + "$TC class add dev $DUMMY parent 1: classid 1:1 drr" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC qdisc ls dev $DUMMY", + "matchPattern": "drr 1: root", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY root handle 1: drr", + "$IP addr del 10.10.10.10/24 dev $DUMMY" + ] + } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json index ae3d286a32b2..6f20d033670d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json @@ -313,6 +313,29 @@ "matchPattern": "qdisc bfifo 1: root", "matchCount": "0", "teardown": [ + ] + }, + { + "id": "d774", + "name": "Check pfifo_head_drop qdisc enqueue behaviour when limit == 0", + "category": [ + "qdisc", + "pfifo_head_drop" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: pfifo_head_drop limit 0", + "$IP link set dev $DUMMY up || true" + ], + "cmdUnderTest": "ping -c2 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "dropped 2", + "matchCount": "1", + "teardown": [ ] } ] diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index dfff8b288265..d0f6d253ac72 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1788,6 +1788,42 @@ static void test_stream_connect_retry_server(const struct test_opts *opts) close(fd); } +static void test_stream_linger_client(const struct test_opts *opts) +{ + struct linger optval = { + .l_onoff = 1, + .l_linger = 1 + }; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) { + perror("setsockopt(SO_LINGER)"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_linger_server(const struct test_opts *opts) +{ + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1943,6 +1979,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_connect_retry_client, .run_server = test_stream_connect_retry_server, }, + { + .name = "SOCK_STREAM SO_LINGER null-ptr-deref", + .run_client = test_stream_linger_client, + .run_server = test_stream_linger_server, + }, {}, }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index faf10671eed2..ba0327e2d0d3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1071,15 +1071,6 @@ out_err: } /* - * Called after the VM is otherwise initialized, but just before adding it to - * the vm_list. - */ -int __weak kvm_arch_post_init_vm(struct kvm *kvm) -{ - return 0; -} - -/* * Called just after removing the VM from the vm_list, but before doing any * other destruction. */ @@ -1199,10 +1190,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_debugfs; - r = kvm_arch_post_init_vm(kvm); - if (r) - goto out_err; - mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); mutex_unlock(&kvm_lock); @@ -1212,8 +1199,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) return kvm; -out_err: - kvm_destroy_vm_debugfs(kvm); out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: @@ -1971,7 +1956,15 @@ static int kvm_set_memory_region(struct kvm *kvm, return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; - if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) + + /* + * The size of userspace-defined memory regions is restricted in order + * to play nice with dirty bitmap operations, which are indexed with an + * "unsigned int". KVM's internal memory regions don't support dirty + * logging, and so are exempt. + */ + if (id < KVM_USER_MEM_SLOTS && + (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) return -EINVAL; slots = __kvm_memslots(kvm, as_id); |