From 28b0f8a6962a24ed21737578f3b1b07424635c9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Feb 2018 07:38:08 -0800 Subject: tty: make n_tty_read() always abort if hangup is in progress A tty is hung up by __tty_hangup() setting file->f_op to hung_up_tty_fops, which is skipped on ttys whose write operation isn't tty_write(). This means that, for example, /dev/console whose write op is redirected_tty_write() is never actually marked hung up. Because n_tty_read() uses the hung up status to decide whether to abort the waiting readers, the lack of hung-up marking can lead to the following scenario. 1. A session contains two processes. The leader and its child. The child ignores SIGHUP. 2. The leader exits and starts disassociating from the controlling terminal (/dev/console). 3. __tty_hangup() skips setting f_op to hung_up_tty_fops. 4. SIGHUP is delivered and ignored. 5. tty_ldisc_hangup() is invoked. It wakes up the waits which should clear the read lockers of tty->ldisc_sem. 6. The reader wakes up but because tty_hung_up_p() is false, it doesn't abort and goes back to sleep while read-holding tty->ldisc_sem. 7. The leader progresses to tty_ldisc_lock() in tty_ldisc_hangup() and is now stuck in D sleep indefinitely waiting for tty->ldisc_sem. The following is Alan's explanation on why some ttys aren't hung up. http://lkml.kernel.org/r/20171101170908.6ad08580@alans-desktop 1. It broke the serial consoles because they would hang up and close down the hardware. With tty_port that *should* be fixable properly for any cases remaining. 2. The console layer was (and still is) completely broken and doens't refcount properly. So if you turn on console hangups it breaks (as indeed does freeing consoles and half a dozen other things). As neither can be fixed quickly, this patch works around the problem by introducing a new flag, TTY_HUPPING, which is used solely to tell n_tty_read() that hang-up is in progress for the console and the readers should be aborted regardless of the hung-up status of the device. The following is a sample hung task warning caused by this issue. INFO: task agetty:2662 blocked for more than 120 seconds. Not tainted 4.11.3-dbg-tty-lockup-02478-gfd6c7ee-dirty #28 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 0 2662 1 0x00000086 Call Trace: __schedule+0x267/0x890 schedule+0x36/0x80 schedule_timeout+0x23c/0x2e0 ldsem_down_write+0xce/0x1f6 tty_ldisc_lock+0x16/0x30 tty_ldisc_hangup+0xb3/0x1b0 __tty_hangup+0x300/0x410 disassociate_ctty+0x6c/0x290 do_exit+0x7ef/0xb00 do_group_exit+0x3f/0xa0 get_signal+0x1b3/0x5d0 do_signal+0x28/0x660 exit_to_usermode_loop+0x46/0x86 do_syscall_64+0x9c/0xb0 entry_SYSCALL64_slow_path+0x25/0x25 The following is the repro. Run "$PROG /dev/console". The parent process hangs in D state. #include #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char **argv) { struct sigaction sact = { .sa_handler = SIG_IGN }; struct timespec ts1s = { .tv_sec = 1 }; pid_t pid; int fd; if (argc < 2) { fprintf(stderr, "test-hung-tty /dev/$TTY\n"); return 1; } /* fork a child to ensure that it isn't already the session leader */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { /* top parent, wait for everyone */ while (waitpid(-1, NULL, 0) >= 0) ; if (errno != ECHILD) perror("waitpid"); return 0; } /* new session, start a new session and set the controlling tty */ if (setsid() < 0) { perror("setsid"); return 1; } fd = open(argv[1], O_RDWR); if (fd < 0) { perror("open"); return 1; } if (ioctl(fd, TIOCSCTTY, 1) < 0) { perror("ioctl"); return 1; } /* fork a child, sleep a bit and exit */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { nanosleep(&ts1s, NULL); printf("Session leader exiting\n"); exit(0); } /* * The child ignores SIGHUP and keeps reading from the controlling * tty. Because SIGHUP is ignored, the child doesn't get killed on * parent exit and the bug in n_tty makes the read(2) block the * parent's control terminal hangup attempt. The parent ends up in * D sleep until the child is explicitly killed. */ sigaction(SIGHUP, &sact, NULL); printf("Child reading tty\n"); while (1) { char buf[1024]; if (read(fd, buf, sizeof(buf)) < 0) { perror("read"); return 1; } } return 0; } Signed-off-by: Tejun Heo Cc: Alan Cox Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 0a6c71e0ad01..47f8af22f216 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -364,6 +364,7 @@ struct tty_file_private { #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ +#define TTY_HUPPING 19 /* Hangup in progress */ #define TTY_LDISC_HALTED 22 /* Line discipline is halted */ /* Values for tty->flow_change */ -- cgit v1.2.3 From fde9fc766e96c494b82931b1d270a9a751be07c0 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Mon, 19 Feb 2018 16:55:06 +0000 Subject: signals: Move put_compat_sigset to compat.h to silence hardened usercopy Since commit afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations"), MIPS systems booting with a compat root filesystem emit a warning when copying compat siginfo to userspace: WARNING: CPU: 0 PID: 953 at mm/usercopy.c:81 usercopy_warn+0x98/0xe8 Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLAB object 'task_struct' (offset 1432, size 16)! Modules linked in: CPU: 0 PID: 953 Comm: S01logging Not tainted 4.16.0-rc2 #10 Stack : ffffffff808c0000 0000000000000000 0000000000000001 65ac85163f3bdc4a 65ac85163f3bdc4a 0000000000000000 90000000ff667ab8 ffffffff808c0000 00000000000003f8 ffffffff808d0000 00000000000000d1 0000000000000000 000000000000003c 0000000000000000 ffffffff808c8ca8 ffffffff808d0000 ffffffff808d0000 ffffffff80810000 fffffc0000000000 ffffffff80785c30 0000000000000009 0000000000000051 90000000ff667eb0 90000000ff667db0 000000007fe0d938 0000000000000018 ffffffff80449958 0000000020052798 ffffffff808c0000 90000000ff664000 90000000ff667ab0 00000000100c0000 ffffffff80698810 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 ffffffff8010d02c 65ac85163f3bdc4a ... Call Trace: [] show_stack+0x9c/0x130 [] dump_stack+0x90/0xd0 [] __warn+0x100/0x118 [] warn_slowpath_fmt+0x4c/0x70 [] usercopy_warn+0x98/0xe8 [] __check_object_size+0xfc/0x250 [] put_compat_sigset+0x30/0x88 [] setup_rt_frame_n32+0xc4/0x160 [] do_signal+0x19c/0x230 [] do_notify_resume+0x60/0x78 [] work_notifysig+0x10/0x18 ---[ end trace 88fffbf69147f48a ]--- Commit 5905429ad856 ("fork: Provide usercopy whitelisting for task_struct") noted that: "While the blocked and saved_sigmask fields of task_struct are copied to userspace (via sigmask_to_save() and setup_rt_frame()), it is always copied with a static length (i.e. sizeof(sigset_t))." However, this is not true in the case of compat signals, whose sigset is copied by put_compat_sigset and receives size as an argument. At most call sites, put_compat_sigset is copying a sigset from the current task_struct. This triggers a warning when CONFIG_HARDENED_USERCOPY is active. However, by marking this function as static inline, the warning can be avoided because in all of these cases the size is constant at compile time, which is allowed. The only site where this is not the case is handling the rt_sigpending syscall, but there the copy is being made from a stack local variable so does not trigger the warning. Move put_compat_sigset to compat.h, and mark it static inline. This fixes the WARN on MIPS. Fixes: afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations") Signed-off-by: Matt Redfearn Acked-by: Kees Cook Cc: "Dmitry V . Levin" Cc: Al Viro Cc: kernel-hardening@lists.openwall.com Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/18639/ Signed-off-by: James Hogan --- include/linux/compat.h | 26 ++++++++++++++++++++++++-- kernel/compat.c | 19 ------------------- 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8a9643857c4a..c4139c7a0de0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -17,6 +17,7 @@ #include #include #include /* for aio_context_t */ +#include #include #include @@ -550,8 +551,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); -extern int put_compat_sigset(compat_sigset_t __user *compat, - const sigset_t *set, unsigned int size); + +/* + * Defined inline such that size can be compile time constant, which avoids + * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct + */ +static inline int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) +{ + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; + switch (_NSIG_WORDS) { + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; + } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? -EFAULT : 0; +#endif +} asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, diff --git a/kernel/compat.c b/kernel/compat.c index 3247fe761f60..3f5fa8902e7d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,25 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -int -put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, - unsigned int size) -{ - /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ -#ifdef __BIG_ENDIAN - compat_sigset_t v; - switch (_NSIG_WORDS) { - case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; - case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; - case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; - case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; - } - return copy_to_user(compat, &v, size) ? -EFAULT : 0; -#else - return copy_to_user(compat, set, size) ? -EFAULT : 0; -#endif -} - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, -- cgit v1.2.3 From d02f51cbcf12b09ab945873e35046045875eed9a Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 3 Mar 2018 03:03:46 +0100 Subject: bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp skbs SCTP GSO skbs have a gso_size of GSO_BY_FRAGS, so any sort of unconditionally mangling of that will result in nonsense value and would corrupt the skb later on. Therefore, i) add two helpers skb_increase_gso_size() and skb_decrease_gso_size() that would throw a one time warning and bail out for such skbs and ii) refuse and return early with an error in those BPF helpers that are affected. We do need to bail out as early as possible from there before any changes on the skb have been performed. Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Co-authored-by: Daniel Borkmann Signed-off-by: Daniel Axtens Cc: Marcelo Ricardo Leitner Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- Documentation/networking/segmentation-offloads.txt | 11 +++- include/linux/skbuff.h | 22 ++++++++ net/core/filter.c | 60 +++++++++++++++------- 3 files changed, 73 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt index d47480b61ac6..23a8dd91a9ec 100644 --- a/Documentation/networking/segmentation-offloads.txt +++ b/Documentation/networking/segmentation-offloads.txt @@ -153,8 +153,15 @@ To signal this, gso_size is set to the special value GSO_BY_FRAGS. Therefore, any code in the core networking stack must be aware of the possibility that gso_size will be GSO_BY_FRAGS and handle that case -appropriately. (For size checks, the skb_gso_validate_*_len family of -helpers do this automatically.) +appropriately. + +There are a couple of helpers to make this easier: + + - For size checks, the skb_gso_validate_*_len family of helpers correctly + considers GSO_BY_FRAGS. + + - For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size + will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs. This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c1e66bdcf583..8c67c33f40c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4038,6 +4038,12 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } +/* Note: Should be called only if skb_is_gso(skb) is true */ +static inline bool skb_is_gso_sctp(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; +} + static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; @@ -4045,6 +4051,22 @@ static inline void skb_gso_reset(struct sk_buff *skb) skb_shinfo(skb)->gso_type = 0; } +static inline void skb_increase_gso_size(struct skb_shared_info *shinfo, + u16 increment) +{ + if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) + return; + shinfo->gso_size += increment; +} + +static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, + u16 decrement) +{ + if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) + return; + shinfo->gso_size -= decrement; +} + void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) diff --git a/net/core/filter.c b/net/core/filter.c index 0c121adbdbaa..48aa7c7320db 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2087,6 +2087,10 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -2096,19 +2100,21 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* SKB_GSO_TCPV4 needs to be changed into * SKB_GSO_TCPV6. */ - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { - skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + if (shinfo->gso_type & SKB_GSO_TCPV4) { + shinfo->gso_type &= ~SKB_GSO_TCPV4; + shinfo->gso_type |= SKB_GSO_TCPV6; } /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_shinfo(skb)->gso_size -= len_diff; + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); @@ -2123,6 +2129,10 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -2132,19 +2142,21 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* SKB_GSO_TCPV6 needs to be changed into * SKB_GSO_TCPV4. */ - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { - skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + if (shinfo->gso_type & SKB_GSO_TCPV6) { + shinfo->gso_type &= ~SKB_GSO_TCPV6; + shinfo->gso_type |= SKB_GSO_TCPV4; } /* Due to IPv4 header, MSS can be upgraded. */ - skb_shinfo(skb)->gso_size += len_diff; + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); @@ -2243,6 +2255,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -2252,11 +2268,13 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* Due to header grow, MSS needs to be downgraded. */ - skb_shinfo(skb)->gso_size -= len_diff; + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } return 0; @@ -2267,6 +2285,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -2276,11 +2298,13 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* Due to header shrink, MSS can be upgraded. */ - skb_shinfo(skb)->gso_size += len_diff; + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } return 0; -- cgit v1.2.3 From a6f1086e29e93621a6394b94b8c0e4a4e490f38b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 5 Mar 2018 15:22:30 -0800 Subject: PCI: Move of_irq_parse_and_map_pci() declaration under OF_IRQ Since commit 4670d610d592 ("PCI: Move OF-related PCI functions into PCI core"), sparc:allmodconfig fails to build with the following error. pcie-cadence-host.c:(.text+0x4c4): undefined reference to `of_irq_parse_and_map_pci' pcie-cadence-host.c:(.text+0x4c8): undefined reference to `of_irq_parse_and_map_pci' of_irq_parse_and_map_pci() is now only available if OF_IRQ is enabled. Make its declaration and its dummy function dependent on OF_IRQ to solve the problem. Fixes: 4670d610d592 ("PCI: Move OF-related PCI functions into PCI core") Signed-off-by: Guenter Roeck Signed-off-by: Bjorn Helgaas Acked-by: Rob Herring --- include/linux/of_pci.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h index 88865e0ebf4d..091033a6b836 100644 --- a/include/linux/of_pci.h +++ b/include/linux/of_pci.h @@ -13,7 +13,6 @@ struct device_node; struct device_node *of_pci_find_child_device(struct device_node *parent, unsigned int devfn); int of_pci_get_devfn(struct device_node *np); -int of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin); int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); @@ -33,12 +32,6 @@ static inline int of_pci_get_devfn(struct device_node *np) return -EINVAL; } -static inline int -of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin) -{ - return 0; -} - static inline int of_pci_parse_bus_range(struct device_node *node, struct resource *res) { @@ -67,6 +60,16 @@ of_pci_get_max_link_speed(struct device_node *node) static inline void of_pci_check_probe_only(void) { } #endif +#if IS_ENABLED(CONFIG_OF_IRQ) +int of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin); +#else +static inline int +of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin) +{ + return 0; +} +#endif + #if defined(CONFIG_OF_ADDRESS) int of_pci_get_host_bridge_resources(struct device_node *dev, unsigned char busno, unsigned char bus_max, -- cgit v1.2.3 From 859d880cf544dbe095ce97534ef04cd88ba2f2b4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 6 Mar 2018 00:20:25 -0600 Subject: signal: Correct the offset of si_pkey in struct siginfo The change moving addr_lsb into the _sigfault union failed to take into account that _sigfault._addr_bnd._lower being a pointer forced the entire union to have pointer alignment. In practice this only mattered for the offset of si_pkey which is why this has taken so long to discover. To correct this change _dummy_pkey and _dummy_bnd to have pointer type. Reported-by: kernel test robot Fixes: b68a68d3dcc1 ("signal: Move addr_lsb into the _sigfault union for clarity") Signed-off-by: "Eric W. Biederman" --- include/linux/compat.h | 4 ++-- include/uapi/asm-generic/siginfo.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8a9643857c4a..e16d07eb08cf 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -229,13 +229,13 @@ typedef struct compat_siginfo { short int _addr_lsb; /* Valid LSB of the reported address. */ /* used when si_code=SEGV_BNDERR */ struct { - short _dummy_bnd; + compat_uptr_t _dummy_bnd; compat_uptr_t _lower; compat_uptr_t _upper; } _addr_bnd; /* used when si_code=SEGV_PKUERR */ struct { - short _dummy_pkey; + compat_uptr_t _dummy_pkey; u32 _pkey; } _addr_pkey; }; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 85dc965afd89..99c902e460c2 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -102,13 +102,13 @@ typedef struct siginfo { short _addr_lsb; /* LSB of the reported address */ /* used when si_code=SEGV_BNDERR */ struct { - short _dummy_bnd; + void *_dummy_bnd; void __user *_lower; void __user *_upper; } _addr_bnd; /* used when si_code=SEGV_PKUERR */ struct { - short _dummy_pkey; + void *_dummy_pkey; __u32 _pkey; } _addr_pkey; }; -- cgit v1.2.3 From cb88a0588717ba6c756cb5972d75766b273a6817 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 6 Mar 2018 09:38:49 +0100 Subject: usb: quirks: add control message delay for 1b1c:1b20 Corsair Strafe RGB keyboard does not respond to usb control messages sometimes and hence generates timeouts. Commit de3af5bf259d ("usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard") tried to fix those timeouts by adding USB_QUIRK_DELAY_INIT. Unfortunately, even with this quirk timeouts of usb_control_msg() can still be seen, but with a lower frequency (approx. 1 out of 15): [ 29.103520] usb 1-8: string descriptor 0 read error: -110 [ 34.363097] usb 1-8: can't set config #1, error -110 Adding further delays to different locations where usb control messages are issued just moves the timeouts to other locations, e.g.: [ 35.400533] usbhid 1-8:1.0: can't add hid device: -110 [ 35.401014] usbhid: probe of 1-8:1.0 failed with error -110 The only way to reliably avoid those issues is having a pause after each usb control message. In approx. 200 boot cycles no more timeouts were seen. Addionaly, keep USB_QUIRK_DELAY_INIT as it turned out to be necessary to have the delay in hub_port_connect() after hub_port_init(). The overall boot time seems not to be influenced by these additional delays, even on fast machines and lightweight distributions. Fixes: de3af5bf259d ("usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard") Cc: stable@vger.kernel.org Signed-off-by: Danilo Krummrich Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 ++++ drivers/usb/core/quirks.c | 3 ++- include/linux/usb/quirks.h | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index c64cf6c4a83d..0c11d40a12bc 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -151,6 +151,10 @@ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, ret = usb_internal_control_msg(dev, pipe, dr, data, size, timeout); + /* Linger a bit, prior to the next control message. */ + if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) + msleep(200); + kfree(dr); return ret; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index f4a548471f0f..54b019e267c5 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -230,7 +230,8 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair Strafe RGB */ - { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT | + USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair K70 LUX */ { USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT }, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index f1fcec2fd5f8..b7a99ce56bc9 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -63,4 +63,7 @@ */ #define USB_QUIRK_DISCONNECT_SUSPEND BIT(12) +/* Device needs a pause after every control message. */ +#define USB_QUIRK_DELAY_CTRL_MSG BIT(13) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From d3dcf8eb615537526bd42ff27a081d46d337816e Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 4 Mar 2018 17:29:48 +0200 Subject: rhashtable: Fix rhlist duplicates insertion When inserting duplicate objects (those with the same key), current rhlist implementation messes up the chain pointers by updating the bucket pointer instead of prev next pointer to the newly inserted node. This causes missing elements on removal and travesal. Fix that by properly updating pprev pointer to point to the correct rhash_head next pointer. Issue: 1241076 Change-Id: I86b2c140bcb4aeb10b70a72a267ff590bb2b17e7 Fixes: ca26893f05e8 ('rhashtable: Add rhlist interface') Signed-off-by: Paul Blakey Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 +++- lib/rhashtable.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index c9df2527e0cd..668a21f04b09 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -766,8 +766,10 @@ slow_path: if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head)))) + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; continue; + } data = rht_obj(ht, head); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 3825c30aaa36..47de025b6245 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -506,8 +506,10 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, if (!key || (ht->p.obj_cmpfn ? ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head)))) + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; continue; + } if (!ht->rhlist) return rht_obj(ht, head); -- cgit v1.2.3 From 2695578b896aea472b2c0dcbe9d92daa71738484 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Mar 2018 11:41:13 -0800 Subject: net: usbnet: fix potential deadlock on 32bit hosts Marek reported a LOCKDEP issue occurring on 32bit host, that we tracked down to the fact that usbnet could either run from soft or hard irqs. This patch adds u64_stats_update_begin_irqsave() and u64_stats_update_end_irqrestore() helpers to solve this case. [ 17.768040] ================================ [ 17.772239] WARNING: inconsistent lock state [ 17.776511] 4.16.0-rc3-next-20180227-00007-g876c53a7493c #453 Not tainted [ 17.783329] -------------------------------- [ 17.787580] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. [ 17.793607] swapper/0/0 [HC0[0]:SC1[1]:HE1:SE0] takes: [ 17.798751] (&syncp->seq#5){?.-.}, at: [<9b22e5f0>] asix_rx_fixup_internal+0x188/0x288 [ 17.806790] {IN-HARDIRQ-W} state was registered at: [ 17.811677] tx_complete+0x100/0x208 [ 17.815319] __usb_hcd_giveback_urb+0x60/0xf0 [ 17.819770] xhci_giveback_urb_in_irq+0xa8/0x240 [ 17.824469] xhci_td_cleanup+0xf4/0x16c [ 17.828367] xhci_irq+0xe74/0x2240 [ 17.831827] usb_hcd_irq+0x24/0x38 [ 17.835343] __handle_irq_event_percpu+0x98/0x510 [ 17.840111] handle_irq_event_percpu+0x1c/0x58 [ 17.844623] handle_irq_event+0x38/0x5c [ 17.848519] handle_fasteoi_irq+0xa4/0x138 [ 17.852681] generic_handle_irq+0x18/0x28 [ 17.856760] __handle_domain_irq+0x6c/0xe4 [ 17.860941] gic_handle_irq+0x54/0xa0 [ 17.864666] __irq_svc+0x70/0xb0 [ 17.867964] arch_cpu_idle+0x20/0x3c [ 17.871578] arch_cpu_idle+0x20/0x3c [ 17.875190] do_idle+0x144/0x218 [ 17.878468] cpu_startup_entry+0x18/0x1c [ 17.882454] start_kernel+0x394/0x400 [ 17.886177] irq event stamp: 161912 [ 17.889616] hardirqs last enabled at (161912): [<7bedfacf>] __netdev_alloc_skb+0xcc/0x140 [ 17.897893] hardirqs last disabled at (161911): [] __netdev_alloc_skb+0x94/0x140 [ 17.904903] exynos5-hsi2c 12ca0000.i2c: tx timeout [ 17.906116] softirqs last enabled at (161904): [<387102ff>] irq_enter+0x78/0x80 [ 17.906123] softirqs last disabled at (161905): [] irq_exit+0x134/0x158 [ 17.925722]. [ 17.925722] other info that might help us debug this: [ 17.933435] Possible unsafe locking scenario: [ 17.933435]. [ 17.940331] CPU0 [ 17.942488] ---- [ 17.944894] lock(&syncp->seq#5); [ 17.948274] [ 17.950847] lock(&syncp->seq#5); [ 17.954386]. [ 17.954386] *** DEADLOCK *** [ 17.954386]. [ 17.962422] no locks held by swapper/0/0. Fixes: c8b5d129ee29 ("net: usbnet: support 64bit stats") Signed-off-by: Eric Dumazet Reported-by: Marek Szyprowski Cc: Greg Ungerer Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 10 ++++++---- include/linux/u64_stats_sync.h | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 8a22ff67b026..d9eea8cfe6cb 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -315,6 +315,7 @@ static void __usbnet_status_stop_force(struct usbnet *dev) void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); + unsigned long flags; int status; if (test_bit(EVENT_RX_PAUSED, &dev->flags)) { @@ -326,10 +327,10 @@ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) if (skb->protocol == 0) skb->protocol = eth_type_trans (skb, dev->net); - u64_stats_update_begin(&stats64->syncp); + flags = u64_stats_update_begin_irqsave(&stats64->syncp); stats64->rx_packets++; stats64->rx_bytes += skb->len; - u64_stats_update_end(&stats64->syncp); + u64_stats_update_end_irqrestore(&stats64->syncp, flags); netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n", skb->len + sizeof (struct ethhdr), skb->protocol); @@ -1248,11 +1249,12 @@ static void tx_complete (struct urb *urb) if (urb->status == 0) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); + unsigned long flags; - u64_stats_update_begin(&stats64->syncp); + flags = u64_stats_update_begin_irqsave(&stats64->syncp); stats64->tx_packets += entry->packets; stats64->tx_bytes += entry->length; - u64_stats_update_end(&stats64->syncp); + u64_stats_update_end_irqrestore(&stats64->syncp, flags); } else { dev->net->stats.tx_errors++; diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 5bdbd9f49395..07ee0f84a46c 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -90,6 +90,28 @@ static inline void u64_stats_update_end(struct u64_stats_sync *syncp) #endif } +static inline unsigned long +u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = 0; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + local_irq_save(flags); + write_seqcount_begin(&syncp->seq); +#endif + return flags; +} + +static inline void +u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_end(&syncp->seq); + local_irq_restore(flags); +#endif +} + static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) -- cgit v1.2.3 From 3a4030761ea88ff439030ca98e3094b9900e96b7 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 9 Mar 2018 14:50:34 +0800 Subject: vhost_net: examine pointer types during un-producing After commit fc72d1d54dd9 ("tuntap: XDP transmission"), we can actually queueing XDP pointers in the pointer ring, so we should examine the pointer type before freeing the pointer. Fixes: fc72d1d54dd9 ("tuntap: XDP transmission") Reported-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 ++- drivers/vhost/net.c | 2 +- include/linux/if_tun.h | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7433bb2e4451..28cfa642e39a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -655,7 +655,7 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) return tun; } -static void tun_ptr_free(void *ptr) +void tun_ptr_free(void *ptr) { if (!ptr) return; @@ -667,6 +667,7 @@ static void tun_ptr_free(void *ptr) __skb_array_destroy_skb(ptr); } } +EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index efb93063fda1..8139bc70ad7d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -170,7 +170,7 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), - __skb_array_destroy_skb); + tun_ptr_free); rxq->head = rxq->tail = 0; } } diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index c5b0a75a7812..fd00170b494f 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -25,6 +25,7 @@ struct ptr_ring *tun_get_tx_ring(struct file *file); bool tun_is_xdp_buff(void *ptr); void *tun_xdp_to_ptr(void *ptr); void *tun_ptr_to_xdp(void *ptr); +void tun_ptr_free(void *ptr); #else #include #include @@ -50,5 +51,8 @@ static inline void *tun_ptr_to_xdp(void *ptr) { return NULL; } +static inline void tun_ptr_free(void *ptr) +{ +} #endif /* CONFIG_TUN */ #endif /* __IF_TUN_H */ -- cgit v1.2.3 From b1d0a5d0cba4597c0394997b2d5fced3e3841b4e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 10 Mar 2018 01:15:45 +0100 Subject: netfilter: x_tables: add and use xt_check_proc_name recent and hashlimit both create /proc files, but only check that name is 0 terminated. This can trigger WARN() from procfs when name is "" or "/". Add helper for this and then use it for both. Cc: Eric Dumazet Reported-by: Eric Dumazet Reported-by: Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 ++ net/netfilter/x_tables.c | 30 ++++++++++++++++++++++++++++++ net/netfilter/xt_hashlimit.c | 16 ++++++++++------ net/netfilter/xt_recent.c | 6 +++--- 4 files changed, 45 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1313b35c3ab7..14529511c4b8 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -285,6 +285,8 @@ unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); +int xt_check_proc_name(const char *name, unsigned int size); + int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index fa1655aff8d3..4aa01c90e9d1 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -423,6 +423,36 @@ textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto) return buf; } +/** + * xt_check_proc_name - check that name is suitable for /proc file creation + * + * @name: file name candidate + * @size: length of buffer + * + * some x_tables modules wish to create a file in /proc. + * This function makes sure that the name is suitable for this + * purpose, it checks that name is NUL terminated and isn't a 'special' + * name, like "..". + * + * returns negative number on error or 0 if name is useable. + */ +int xt_check_proc_name(const char *name, unsigned int size) +{ + if (name[0] == '\0') + return -EINVAL; + + if (strnlen(name, size) == size) + return -ENAMETOOLONG; + + if (strcmp(name, ".") == 0 || + strcmp(name, "..") == 0 || + strchr(name, '/')) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(xt_check_proc_name); + int xt_check_match(struct xt_mtchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 66f5aca62a08..3360f13dc208 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -917,8 +917,9 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) struct hashlimit_cfg3 cfg = {}; int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); @@ -935,8 +936,9 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) struct hashlimit_cfg3 cfg = {}; int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); @@ -950,9 +952,11 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) static int hashlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo3 *info = par->matchinfo; + int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, info->name, 3); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 6d232d18faff..81ee1d6543b2 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -361,9 +361,9 @@ static int recent_mt_check(const struct xt_mtchk_param *par, info->hit_count, XT_RECENT_MAX_NSTAMPS - 1); return -EINVAL; } - if (info->name[0] == '\0' || - strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; if (ip_pkt_list_tot && info->hit_count < ip_pkt_list_tot) nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1; -- cgit v1.2.3 From a2c054a896b8ac794ddcfc7c92e2dc7ec4ed4ed5 Mon Sep 17 00:00:00 2001 From: Brad Mouring Date: Thu, 8 Mar 2018 16:23:03 -0600 Subject: net: phy: Tell caller result of phy_change() In 664fcf123a30e (net: phy: Threaded interrupts allow some simplification) the phy_interrupt system was changed to use a traditional threaded interrupt scheme instead of a workqueue approach. With this change, the phy status check moved into phy_change, which did not report back to the caller whether or not the interrupt was handled. This means that, in the case of a shared phy interrupt, only the first phydev's interrupt registers are checked (since phy_interrupt() would always return IRQ_HANDLED). This leads to interrupt storms when it is a secondary device that's actually the interrupt source. Signed-off-by: Brad Mouring Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 145 +++++++++++++++++++++++++------------------------- include/linux/phy.h | 1 - 2 files changed, 72 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index a6f924fee584..9aabfa1a455a 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -617,6 +617,77 @@ static void phy_error(struct phy_device *phydev) phy_trigger_machine(phydev, false); } +/** + * phy_disable_interrupts - Disable the PHY interrupts from the PHY side + * @phydev: target phy_device struct + */ +static int phy_disable_interrupts(struct phy_device *phydev) +{ + int err; + + /* Disable PHY interrupts */ + err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED); + if (err) + goto phy_err; + + /* Clear the interrupt */ + err = phy_clear_interrupt(phydev); + if (err) + goto phy_err; + + return 0; + +phy_err: + phy_error(phydev); + + return err; +} + +/** + * phy_change - Called by the phy_interrupt to handle PHY changes + * @phydev: phy_device struct that interrupted + */ +static irqreturn_t phy_change(struct phy_device *phydev) +{ + if (phy_interrupt_is_valid(phydev)) { + if (phydev->drv->did_interrupt && + !phydev->drv->did_interrupt(phydev)) + return IRQ_NONE; + + if (phydev->state == PHY_HALTED) + if (phy_disable_interrupts(phydev)) + goto phy_err; + } + + mutex_lock(&phydev->lock); + if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state)) + phydev->state = PHY_CHANGELINK; + mutex_unlock(&phydev->lock); + + /* reschedule state queue work to run as soon as possible */ + phy_trigger_machine(phydev, true); + + if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev)) + goto phy_err; + return IRQ_HANDLED; + +phy_err: + phy_error(phydev); + return IRQ_NONE; +} + +/** + * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes + * @work: work_struct that describes the work to be done + */ +void phy_change_work(struct work_struct *work) +{ + struct phy_device *phydev = + container_of(work, struct phy_device, phy_queue); + + phy_change(phydev); +} + /** * phy_interrupt - PHY interrupt handler * @irq: interrupt line @@ -632,9 +703,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat) if (PHY_HALTED == phydev->state) return IRQ_NONE; /* It can't be ours. */ - phy_change(phydev); - - return IRQ_HANDLED; + return phy_change(phydev); } /** @@ -651,32 +720,6 @@ static int phy_enable_interrupts(struct phy_device *phydev) return phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED); } -/** - * phy_disable_interrupts - Disable the PHY interrupts from the PHY side - * @phydev: target phy_device struct - */ -static int phy_disable_interrupts(struct phy_device *phydev) -{ - int err; - - /* Disable PHY interrupts */ - err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED); - if (err) - goto phy_err; - - /* Clear the interrupt */ - err = phy_clear_interrupt(phydev); - if (err) - goto phy_err; - - return 0; - -phy_err: - phy_error(phydev); - - return err; -} - /** * phy_start_interrupts - request and enable interrupts for a PHY device * @phydev: target phy_device struct @@ -719,50 +762,6 @@ int phy_stop_interrupts(struct phy_device *phydev) } EXPORT_SYMBOL(phy_stop_interrupts); -/** - * phy_change - Called by the phy_interrupt to handle PHY changes - * @phydev: phy_device struct that interrupted - */ -void phy_change(struct phy_device *phydev) -{ - if (phy_interrupt_is_valid(phydev)) { - if (phydev->drv->did_interrupt && - !phydev->drv->did_interrupt(phydev)) - return; - - if (phydev->state == PHY_HALTED) - if (phy_disable_interrupts(phydev)) - goto phy_err; - } - - mutex_lock(&phydev->lock); - if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state)) - phydev->state = PHY_CHANGELINK; - mutex_unlock(&phydev->lock); - - /* reschedule state queue work to run as soon as possible */ - phy_trigger_machine(phydev, true); - - if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev)) - goto phy_err; - return; - -phy_err: - phy_error(phydev); -} - -/** - * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes - * @work: work_struct that describes the work to be done - */ -void phy_change_work(struct work_struct *work) -{ - struct phy_device *phydev = - container_of(work, struct phy_device, phy_queue); - - phy_change(phydev); -} - /** * phy_stop - Bring down the PHY link, and stop checking the status * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index d7069539f351..b260fb336b25 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1012,7 +1012,6 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_state_machine(struct work_struct *work); -void phy_change(struct phy_device *phydev); void phy_change_work(struct work_struct *work); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); -- cgit v1.2.3 From bf2ae2e4bf9360e07c0cdfa166bcdc0afd92f4ce Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 10 Mar 2018 18:57:50 +0800 Subject: sock_diag: request _diag module only when the family or proto has been registered Now when using 'ss' in iproute, kernel would try to load all _diag modules, which also causes corresponding family and proto modules to be loaded as well due to module dependencies. Like after running 'ss', sctp, dccp, af_packet (if it works as a module) would be loaded. For example: $ lsmod|grep sctp $ ss $ lsmod|grep sctp sctp_diag 16384 0 sctp 323584 5 sctp_diag inet_diag 24576 4 raw_diag,tcp_diag,sctp_diag,udp_diag libcrc32c 16384 3 nf_conntrack,nf_nat,sctp As these family and proto modules are loaded unintentionally, it could cause some problems, like: - Some debug tools use 'ss' to collect the socket info, which loads all those diag and family and protocol modules. It's noisy for identifying issues. - Users usually expect to drop sctp init packet silently when they have no sense of sctp protocol instead of sending abort back. - It wastes resources (especially with multiple netns), and SCTP module can't be unloaded once it's loaded. ... In short, it's really inappropriate to have these family and proto modules loaded unexpectedly when just doing debugging with inet_diag. This patch is to introduce sock_load_diag_module() where it loads the _diag module only when it's corresponding family or proto has been already registered. Note that we can't just load _diag module without the family or proto loaded, as some symbols used in _diag module are from the family or proto module. v1->v2: - move inet proto check to inet_diag to avoid a compiling err. v2->v3: - define sock_load_diag_module in sock.c and export one symbol only. - improve the changelog. Reported-by: Sabrina Dubroca Acked-by: Marcelo Ricardo Leitner Acked-by: Phil Sutter Acked-by: Sabrina Dubroca Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/net.h | 1 + include/net/sock.h | 1 + net/core/sock.c | 21 +++++++++++++++++++++ net/core/sock_diag.c | 12 ++++-------- net/ipv4/inet_diag.c | 3 +-- net/socket.c | 5 +++++ 6 files changed, 33 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 91216b16feb7..2a0391eea05c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -222,6 +222,7 @@ enum { int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); +bool sock_is_registered(int family); int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); int sock_create(int family, int type, int proto, struct socket **res); diff --git a/include/net/sock.h b/include/net/sock.h index 169c92afcafa..ae23f3b389ca 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1137,6 +1137,7 @@ struct proto { int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); +int sock_load_diag_module(int family, int protocol); #ifdef SOCK_REFCNT_DEBUG static inline void sk_refcnt_debug_inc(struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index c501499a04fe..85b0b64e7f9d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3261,6 +3261,27 @@ void proto_unregister(struct proto *prot) } EXPORT_SYMBOL(proto_unregister); +int sock_load_diag_module(int family, int protocol) +{ + if (!protocol) { + if (!sock_is_registered(family)) + return -ENOENT; + + return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family); + } + +#ifdef CONFIG_INET + if (family == AF_INET && + !rcu_access_pointer(inet_protos[protocol])) + return -ENOENT; +#endif + + return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family, protocol); +} +EXPORT_SYMBOL(sock_load_diag_module); + #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 146b50e30659..c37b5be7c5e4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -220,8 +220,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (sock_diag_handlers[req->sdiag_family] == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, req->sdiag_family); + sock_load_diag_module(req->sdiag_family, 0); mutex_lock(&sock_diag_table_mutex); hndl = sock_diag_handlers[req->sdiag_family]; @@ -247,8 +246,7 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case TCPDIAG_GETSOCK: case DCCPDIAG_GETSOCK: if (inet_rcv_compat == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + sock_load_diag_module(AF_INET, 0); mutex_lock(&sock_diag_table_mutex); if (inet_rcv_compat != NULL) @@ -281,14 +279,12 @@ static int sock_diag_bind(struct net *net, int group) case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: if (!sock_diag_handlers[AF_INET]) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET6); + sock_load_diag_module(AF_INET6, 0); break; } return 0; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a383f299ce24..4e5bc4b2f14e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -53,8 +53,7 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { if (!inet_diag_table[proto]) - request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET, proto); + sock_load_diag_module(AF_INET, proto); mutex_lock(&inet_diag_table_mutex); if (!inet_diag_table[proto]) diff --git a/net/socket.c b/net/socket.c index a93c99b518ca..08847c3b8c39 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2587,6 +2587,11 @@ void sock_unregister(int family) } EXPORT_SYMBOL(sock_unregister); +bool sock_is_registered(int family) +{ + return family < NPROTO && rcu_access_pointer(net_families[family]); +} + static int __init sock_init(void) { int err; -- cgit v1.2.3 From c2b37f76485f073f020e60b5954b6dc4e55f693c Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Thu, 8 Mar 2018 15:51:41 +0200 Subject: IB/mlx5: Fix integer overflows in mlx5_ib_create_srq This patch validates user provided input to prevent integer overflow due to integer manipulation in the mlx5_ib_create_srq function. Cc: syzkaller Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Boris Pismenny Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/srq.c | 15 +++++++++------ include/linux/mlx5/driver.h | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 6d5fadad9090..3c7522d025f2 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -241,8 +241,8 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_srq *srq; - int desc_size; - int buf_size; + size_t desc_size; + size_t buf_size; int err; struct mlx5_srq_attr in = {0}; __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); @@ -266,15 +266,18 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + if (desc_size == 0 || srq->msrq.max_gs > desc_size) + return ERR_PTR(-EINVAL); desc_size = roundup_pow_of_two(desc_size); - desc_size = max_t(int, 32, desc_size); + desc_size = max_t(size_t, 32, desc_size); + if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) + return ERR_PTR(-EINVAL); srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / sizeof(struct mlx5_wqe_data_seg); srq->msrq.wqe_shift = ilog2(desc_size); buf_size = srq->msrq.max * desc_size; - mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", - desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, - srq->msrq.max_avail_gather); + if (buf_size < desc_size) + return ERR_PTR(-EINVAL); in.type = init_attr->srq_type; if (pd->uobject) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6ed79a8a8318..9d3a03364e6e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -453,8 +453,8 @@ struct mlx5_core_srq { struct mlx5_core_rsc_common common; /* must be first */ u32 srqn; int max; - int max_gs; - int max_avail_gather; + size_t max_gs; + size_t max_avail_gather; int wqe_shift; void (*event) (struct mlx5_core_srq *, enum mlx5_event); -- cgit v1.2.3 From 6417250d3f894e66a68ba1cd93676143f2376a6f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 6 Mar 2018 19:34:42 -0800 Subject: workqueue: remove unused cancel_work() Found this by accident. There are no usages of bare cancel_work() in current kernel source. Signed-off-by: Stephen Hemminger Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 - kernel/workqueue.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bc0cda180c8b..0c3301421c57 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -456,7 +456,6 @@ extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); -extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ccd1080dd6e7..6ec6ba65127b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3018,14 +3018,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } -/* - * See cancel_delayed_work() - */ -bool cancel_work(struct work_struct *work) -{ - return __cancel_work(work, false); -} - /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel -- cgit v1.2.3 From 4dcb31d4649df36297296b819437709f5407059c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Mar 2018 09:04:16 -0700 Subject: net: use skb_to_full_sk() in skb_update_prio() Andrei Vagin reported a KASAN: slab-out-of-bounds error in skb_update_prio() Since SYNACK might be attached to a request socket, we need to get back to the listener socket. Since this listener is manipulated without locks, add const qualifiers to sock_cgroup_prioidx() so that the const can also be used in skb_update_prio() Also add the const qualifier to sock_cgroup_classid() for consistency. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Reported-by: Andrei Vagin Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 4 ++-- net/core/dev.c | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9f242b876fde..f8e76d01a5ad 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -755,13 +755,13 @@ struct sock_cgroup_data { * updaters and return part of the previous pointer as the prioidx or * classid. Such races are short-lived and the result isn't critical. */ -static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { /* fallback to 1 which is always the ID of the root cgroup */ return (skcd->is_data & 1) ? skcd->prioidx : 1; } -static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { /* fallback to 0 which is the unconfigured default classid */ return (skcd->is_data & 1) ? skcd->classid : 0; diff --git a/net/core/dev.c b/net/core/dev.c index 2cedf520cb28..12be20535714 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3278,15 +3278,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { - struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + const struct netprio_map *map; + const struct sock *sk; + unsigned int prioidx; - if (!skb->priority && skb->sk && map) { - unsigned int prioidx = - sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); + if (skb->priority) + return; + map = rcu_dereference_bh(skb->dev->priomap); + if (!map) + return; + sk = skb_to_full_sk(skb); + if (!sk) + return; - if (prioidx < map->priomap_len) - skb->priority = map->priomap[prioidx]; - } + prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); + + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) -- cgit v1.2.3 From 16ca6a607d84bef0129698d8d808f501afd08d43 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Mar 2018 21:48:01 +0000 Subject: KVM: arm/arm64: vgic: Don't populate multiple LRs with the same vintid The vgic code is trying to be clever when injecting GICv2 SGIs, and will happily populate LRs with the same interrupt number if they come from multiple vcpus (after all, they are distinct interrupt sources). Unfortunately, this is against the letter of the architecture, and the GICv2 architecture spec says "Each valid interrupt stored in the List registers must have a unique VirtualID for that virtual CPU interface.". GICv3 has similar (although slightly ambiguous) restrictions. This results in guests locking up when using GICv2-on-GICv3, for example. The obvious fix is to stop trying so hard, and inject a single vcpu per SGI per guest entry. After all, pending SGIs with multiple source vcpus are pretty rare, and are mostly seen in scenario where the physical CPUs are severely overcomitted. But as we now only inject a single instance of a multi-source SGI per vcpu entry, we may delay those interrupts for longer than strictly necessary, and run the risk of injecting lower priority interrupts in the meantime. In order to address this, we adopt a three stage strategy: - If we encounter a multi-source SGI in the AP list while computing its depth, we force the list to be sorted - When populating the LRs, we prevent the injection of any interrupt of lower priority than that of the first multi-source SGI we've injected. - Finally, the injection of a multi-source SGI triggers the request of a maintenance interrupt when there will be no pending interrupt in the LRs (HCR_NPIE). At the point where the last pending interrupt in the LRs switches from Pending to Active, the maintenance interrupt will be delivered, allowing us to add the remaining SGIs using the same process. Cc: stable@vger.kernel.org Fixes: 0919e84c0fc1 ("KVM: arm/arm64: vgic-new: Add IRQ sync/flush framework") Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 1 + include/linux/irqchip/arm-gic.h | 1 + virt/kvm/arm/vgic/vgic-v2.c | 9 +++++- virt/kvm/arm/vgic/vgic-v3.c | 9 +++++- virt/kvm/arm/vgic/vgic.c | 61 +++++++++++++++++++++++++++++--------- virt/kvm/arm/vgic/vgic.h | 2 ++ 6 files changed, 67 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c00c4c33e432..b26eccc78fb1 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -503,6 +503,7 @@ #define ICH_HCR_EN (1 << 0) #define ICH_HCR_UIE (1 << 1) +#define ICH_HCR_NPIE (1 << 3) #define ICH_HCR_TC (1 << 10) #define ICH_HCR_TALL0 (1 << 11) #define ICH_HCR_TALL1 (1 << 12) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index d3453ee072fc..68d8b1f73682 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -84,6 +84,7 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_NPIE (1 << 3) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index e9d840a75e7b..29556f71b691 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -37,6 +37,13 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } +void vgic_v2_set_npie(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr |= GICH_HCR_NPIE; +} + void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; @@ -64,7 +71,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) int lr; unsigned long flags; - cpuif->vgic_hcr &= ~GICH_HCR_UIE; + cpuif->vgic_hcr &= ~(GICH_HCR_UIE | GICH_HCR_NPIE); for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 6b329414e57a..0ff2006f3781 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -26,6 +26,13 @@ static bool group1_trap; static bool common_trap; static bool gicv4_enable; +void vgic_v3_set_npie(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + cpuif->vgic_hcr |= ICH_HCR_NPIE; +} + void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; @@ -47,7 +54,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) int lr; unsigned long flags; - cpuif->vgic_hcr &= ~ICH_HCR_UIE; + cpuif->vgic_hcr &= ~(ICH_HCR_UIE | ICH_HCR_NPIE); for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 0001858a2c23..8201899126f6 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -710,22 +710,37 @@ static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) vgic_v3_set_underflow(vcpu); } +static inline void vgic_set_npie(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_npie(vcpu); + else + vgic_v3_set_npie(vcpu); +} + /* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu) +static int compute_ap_list_depth(struct kvm_vcpu *vcpu, + bool *multi_sgi) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; int count = 0; + *multi_sgi = false; + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); /* GICv2 SGIs can count for more than one... */ - if (vgic_irq_is_sgi(irq->intid) && irq->source) - count += hweight8(irq->source); - else + if (vgic_irq_is_sgi(irq->intid) && irq->source) { + int w = hweight8(irq->source); + + count += w; + *multi_sgi |= (w > 1); + } else { count++; + } spin_unlock(&irq->irq_lock); } return count; @@ -736,28 +751,43 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; - int count = 0; + int count; + bool npie = false; + bool multi_sgi; + u8 prio = 0xff; DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) + count = compute_ap_list_depth(vcpu, &multi_sgi); + if (count > kvm_vgic_global_state.nr_lr || multi_sgi) vgic_sort_ap_list(vcpu); + count = 0; + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); - if (unlikely(vgic_target_oracle(irq) != vcpu)) - goto next; - /* - * If we get an SGI with multiple sources, try to get - * them in all at once. + * If we have multi-SGIs in the pipeline, we need to + * guarantee that they are all seen before any IRQ of + * lower priority. In that case, we need to filter out + * these interrupts by exiting early. This is easy as + * the AP list has been sorted already. */ - do { + if (multi_sgi && irq->priority > prio) { + spin_unlock(&irq->irq_lock); + break; + } + + if (likely(vgic_target_oracle(irq) == vcpu)) { vgic_populate_lr(vcpu, irq, count++); - } while (irq->source && count < kvm_vgic_global_state.nr_lr); -next: + if (irq->source) { + npie = true; + prio = irq->priority; + } + } + spin_unlock(&irq->irq_lock); if (count == kvm_vgic_global_state.nr_lr) { @@ -768,6 +798,9 @@ next: } } + if (npie) + vgic_set_npie(vcpu); + vcpu->arch.vgic_cpu.used_lrs = count; /* Nuke remaining LRs */ diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 5b11859a1a1e..f5b8519e5546 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -160,6 +160,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v2_set_npie(struct kvm_vcpu *vcpu); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); @@ -189,6 +190,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_set_npie(struct kvm_vcpu *vcpu); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 95dd77580ccd66a0da96e6d4696945b8cea39431 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 14 Mar 2018 18:20:29 -0500 Subject: fs: Teach path_connected to handle nfs filesystems with multiple roots. On nfsv2 and nfsv3 the nfs server can export subsets of the same filesystem and report the same filesystem identifier, so that the nfs client can know they are the same filesystem. The subsets can be from disjoint directory trees. The nfsv2 and nfsv3 filesystems provides no way to find the common root of all directory trees exported form the server with the same filesystem identifier. The practical result is that in struct super s_root for nfs s_root is not necessarily the root of the filesystem. The nfs mount code sets s_root to the root of the first subset of the nfs filesystem that the kernel mounts. This effects the dcache invalidation code in generic_shutdown_super currently called shrunk_dcache_for_umount and that code for years has gone through an additional list of dentries that might be dentry trees that need to be freed to accomodate nfs. When I wrote path_connected I did not realize nfs was so special, and it's hueristic for avoiding calling is_subdir can fail. The practical case where this fails is when there is a move of a directory from the subtree exposed by one nfs mount to the subtree exposed by another nfs mount. This move can happen either locally or remotely. With the remote case requiring that the move directory be cached before the move and that after the move someone walks the path to where the move directory now exists and in so doing causes the already cached directory to be moved in the dcache through the magic of d_splice_alias. If someone whose working directory is in the move directory or a subdirectory and now starts calling .. from the initial mount of nfs (where s_root == mnt_root), then path_connected as a heuristic will not bother with the is_subdir check. As s_root really is not the root of the nfs filesystem this heuristic is wrong, and the path may actually not be connected and path_connected can fail. The is_subdir function might be cheap enough that we can call it unconditionally. Verifying that will take some benchmarking and the result may not be the same on all kernels this fix needs to be backported to. So I am avoiding that for now. Filesystems with snapshots such as nilfs and btrfs do something similar. But as the directory tree of the snapshots are disjoint from one another and from the main directory tree rename won't move things between them and this problem will not occur. Cc: stable@vger.kernel.org Reported-by: Al Viro Fixes: 397d425dc26d ("vfs: Test for and handle paths that are unreachable from their mnt_root") Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/namei.c | 5 +++-- fs/nfs/super.c | 2 ++ include/linux/fs.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 921ae32dbc80..cafa365eeb70 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd) static bool path_connected(const struct path *path) { struct vfsmount *mnt = path->mnt; + struct super_block *sb = mnt->mnt_sb; - /* Only bind mounts can have disconnected paths */ - if (mnt->mnt_root == mnt->mnt_sb->s_root) + /* Bind mounts and multi-root filesystems can have disconnected paths */ + if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; return is_subdir(path->dentry, mnt->mnt_root); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29bacdc56f6a..5e470e233c83 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* initial superblock/root creation */ mount_info->fill_super(s, mount_info); nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + if (!(server->flags & NFS_MOUNT_UNSHARED)) + s->s_iflags |= SB_I_MULTIROOT; } mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a815560fda0..0430e03febaa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1317,6 +1317,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -- cgit v1.2.3 From cbe7128c4b92e2004984f477fd38dfa81662f02e Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Tue, 13 Mar 2018 14:51:28 +0900 Subject: vlan: Fix out of order vlan headers with reorder header off With reorder header off, received packets are untagged in skb_vlan_untag() called from within __netif_receive_skb_core(), and later the tag will be inserted back in vlan_do_receive(). This caused out of order vlan headers when we create a vlan device on top of another vlan device, because vlan_do_receive() inserts a tag as the outermost vlan tag. E.g. the outer tag is first removed in skb_vlan_untag() and inserted back in vlan_do_receive(), then the inner tag is next removed and inserted back as the outermost tag. This patch fixes the behaviour by inserting the inner tag at the right position. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 66 ++++++++++++++++++++++++++++++++++++++++--------- net/8021q/vlan_core.c | 4 +-- 2 files changed, 57 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 5e6a2d4dc366..c4a1cff9c768 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -300,30 +300,34 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features, } /** - * __vlan_insert_tag - regular VLAN tag inserting + * __vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert + * @mac_len: MAC header length including outer vlan headers * - * Inserts the VLAN tag into @skb as part of the payload + * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns error if skb_cow_head failes. * * Does not change skb->protocol so this function can be used during receive. */ -static inline int __vlan_insert_tag(struct sk_buff *skb, - __be16 vlan_proto, u16 vlan_tci) +static inline int __vlan_insert_inner_tag(struct sk_buff *skb, + __be16 vlan_proto, u16 vlan_tci, + unsigned int mac_len) { struct vlan_ethhdr *veth; if (skb_cow_head(skb, VLAN_HLEN) < 0) return -ENOMEM; - veth = skb_push(skb, VLAN_HLEN); + skb_push(skb, VLAN_HLEN); - /* Move the mac addresses to the beginning of the new header. */ - memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); + /* Move the mac header sans proto to the beginning of the new header. */ + memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); skb->mac_header -= VLAN_HLEN; + veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); + /* first, the ethernet type */ veth->h_vlan_proto = vlan_proto; @@ -334,12 +338,30 @@ static inline int __vlan_insert_tag(struct sk_buff *skb, } /** - * vlan_insert_tag - regular VLAN tag inserting + * __vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload + * Returns error if skb_cow_head failes. + * + * Does not change skb->protocol so this function can be used during receive. + */ +static inline int __vlan_insert_tag(struct sk_buff *skb, + __be16 vlan_proto, u16 vlan_tci) +{ + return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); +} + +/** + * vlan_insert_inner_tag - inner VLAN tag inserting + * @skb: skbuff to tag + * @vlan_proto: VLAN encapsulation protocol + * @vlan_tci: VLAN TCI to insert + * @mac_len: MAC header length including outer vlan headers + * + * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns a VLAN tagged skb. If a new skb is created, @skb is freed. * * Following the skb_unshare() example, in case of error, the calling function @@ -347,12 +369,14 @@ static inline int __vlan_insert_tag(struct sk_buff *skb, * * Does not change skb->protocol so this function can be used during receive. */ -static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, - __be16 vlan_proto, u16 vlan_tci) +static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, + __be16 vlan_proto, + u16 vlan_tci, + unsigned int mac_len) { int err; - err = __vlan_insert_tag(skb, vlan_proto, vlan_tci); + err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len); if (err) { dev_kfree_skb_any(skb); return NULL; @@ -360,6 +384,26 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, return skb; } +/** + * vlan_insert_tag - regular VLAN tag inserting + * @skb: skbuff to tag + * @vlan_proto: VLAN encapsulation protocol + * @vlan_tci: VLAN TCI to insert + * + * Inserts the VLAN tag into @skb as part of the payload + * Returns a VLAN tagged skb. If a new skb is created, @skb is freed. + * + * Following the skb_unshare() example, in case of error, the calling function + * doesn't have to worry about freeing the original skb. + * + * Does not change skb->protocol so this function can be used during receive. + */ +static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, + __be16 vlan_proto, u16 vlan_tci) +{ + return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); +} + /** * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 64aa9f755e1d..45c9bf5ff3a0 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -48,8 +48,8 @@ bool vlan_do_receive(struct sk_buff **skbp) * original position later */ skb_push(skb, offset); - skb = *skbp = vlan_insert_tag(skb, skb->vlan_proto, - skb->vlan_tci); + skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto, + skb->vlan_tci, skb->mac_len); if (!skb) return false; skb_pull(skb, offset + VLAN_HLEN); -- cgit v1.2.3 From b3a5d111994450909158929560906f2c1c6c1d85 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:45:12 -0700 Subject: percpu_ref: Update doc to dissuade users from depending on internal RCU grace periods percpu_ref internally uses sched-RCU to implement the percpu -> atomic mode switching and the documentation suggested that this could be depended upon. This doesn't seem like a good idea. * percpu_ref uses sched-RCU which has different grace periods regular RCU. Users may combine percpu_ref with regular RCU usage and incorrectly believe that regular RCU grace periods are performed by percpu_ref. This can lead to, for example, use-after-free due to premature freeing. * percpu_ref has a grace period when switching from percpu to atomic mode. It doesn't have one between the last put and release. This distinction is subtle and can lead to surprising bugs. * percpu_ref allows starting in and switching to atomic mode manually for debugging and other purposes. This means that there may not be any grace periods from kill to release. This patch makes it clear that the grace periods are percpu_ref's internal implementation detail and can't be depended upon by the users. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Linus Torvalds Signed-off-by: Tejun Heo --- include/linux/percpu-refcount.h | 18 ++++++++++++------ lib/percpu-refcount.c | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 864d167a1073..009cdf3d65b6 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -30,10 +30,14 @@ * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it - * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove - * the kioctx from the proccess's list of kioctxs - after that, there can't be - * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop - * the initial ref with percpu_ref_put(). + * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref. + * After that, there can't be any new users of the kioctx (from lookup_ioctx()) + * and it's then safe to drop the initial ref with percpu_ref_put(). + * + * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() + * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't + * imply RCU grace periods of any kind and if a user wants to combine percpu_ref + * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped @@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref); * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * - * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the - * percpu counters and dropping the initial ref. + * Switches @ref into atomic mode before gathering up the percpu counters + * and dropping the initial ref. + * + * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 30e7dd88148b..9f96fa7bc000 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). + * + * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) -- cgit v1.2.3 From 5df7af85ecd88e8b5f1f31d6456c3cf38a8bbdda Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 20 Mar 2018 09:44:52 +0800 Subject: net: phy: Add general dummy stubs for MMD register access For some phy devices, even though they don't support the MMD extended register access, it does have some side effect if we are trying to read/write the MMD registers via indirect method. So introduce general dummy stubs for MMD register access which these devices can use to avoid such side effect. Fixes: b6b5e8a69118 ("gianfar: Disable EEE autoneg by default") Signed-off-by: Kevin Hao Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 17 +++++++++++++++++ include/linux/phy.h | 4 ++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index fe16f5894092..74664a6c0cdc 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1673,6 +1673,23 @@ int genphy_config_init(struct phy_device *phydev) } EXPORT_SYMBOL(genphy_config_init); +/* This is used for the phy device which doesn't support the MMD extended + * register access, but it does have side effect when we are trying to access + * the MMD register via indirect method. + */ +int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(genphy_read_mmd_unsupported); + +int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, + u16 regnum, u16 val) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(genphy_write_mmd_unsupported); + int genphy_suspend(struct phy_device *phydev) { return phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN); diff --git a/include/linux/phy.h b/include/linux/phy.h index b260fb336b25..7c4c2379e010 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -984,6 +984,10 @@ static inline int genphy_no_soft_reset(struct phy_device *phydev) { return 0; } +int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, + u16 regnum); +int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, + u16 regnum, u16 val); /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From f59f1caf72ba00d519c793c3deb32cd3be32edc2 Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Thu, 22 Mar 2018 16:17:38 -0700 Subject: Revert "mm: page_alloc: skip over regions of invalid pfns where possible" This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible"). The commit is meant to be a boot init speed up skipping the loop in memmap_init_zone() for invalid pfns. But given some specific memory mapping on x86_64 (or more generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID) the implementation also skips valid pfns which is plain wrong and causes 'kernel BUG at mm/page_alloc.c:1389!' crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1 kernel BUG at mm/page_alloc.c:1389! invalid opcode: 0000 [#1] SMP -- RIP: 0010: move_freepages+0x15e/0x160 -- Call Trace: move_freepages_block+0x73/0x80 __rmqueue+0x263/0x460 get_page_from_freelist+0x7e1/0x9e0 __alloc_pages_nodemask+0x176/0x420 -- crash> page_init_bug -v | grep RAM 1000 - 9bfff System RAM (620.00 KiB) 100000 - 430bffff System RAM ( 1.05 GiB = 1071.75 MiB = 1097472.00 KiB) 4b0c8000 - 4bf9cfff System RAM ( 14.83 MiB = 15188.00 KiB) 4bfac000 - 646b1fff System RAM (391.02 MiB = 400408.00 KiB) 7b788000 - 7b7fffff System RAM (480.00 KiB) 100000000 - 67fffffff System RAM ( 22.00 GiB) crash> page_init_bug | head -6 7b788000 - 7b7fffff System RAM (480.00 KiB) 1fffff00000000 0 1 DMA32 4096 1048575 505736 505344 505855 0 0 0 DMA 1 4095 1fffff00000400 0 1 DMA32 4096 1048575 BUG, zones differ! crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001ed7fc0 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 0 0 <<<< ffffea0001ede1c0 7b787000 0 0 0 0 ffffea0001ede200 7b788000 0 0 1 1fffff00000000 Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") Signed-off-by: Daniel Vacek Acked-by: Ard Biesheuvel Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Mel Gorman Cc: Pavel Tatashin Cc: Paul Burton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 - mm/memblock.c | 28 ---------------------------- mm/page_alloc.c | 11 +---------- 3 files changed, 1 insertion(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8be5077efb5f..f92ea7783652 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); -unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index b6ba6b7adadc..48376bd33274 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } -unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, - unsigned long max_pfn) -{ - struct memblock_type *type = &memblock.memory; - unsigned int right = type->cnt; - unsigned int mid, left = 0; - phys_addr_t addr = PFN_PHYS(++pfn); - - do { - mid = (right + left) / 2; - - if (addr < type->regions[mid].base) - right = mid; - else if (addr >= (type->regions[mid].base + - type->regions[mid].size)) - left = mid + 1; - else { - /* addr is within the region, so pfn is valid */ - return pfn; - } - } while (left < right); - - if (right == type->cnt) - return -1UL; - else - return PHYS_PFN(type->regions[right].base); -} - /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 010dee0f089e..1741dd23e7c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Skip to the pfn preceding the next valid one (or - * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. - */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; -#endif + if (!early_pfn_valid(pfn)) continue; - } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit v1.2.3