From 0c1a7f13c9ec1ceb18d97ef4b1dd20ec71ffba31 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Wed, 17 Jun 2020 17:01:32 +0530 Subject: ieee80211: Add missing and new AKM suite selector definitions Add the definitions for missing AKM selectors defined in IEEE P802.11-REVmd/D3.0, table 9-151. These definitions will be used by various drivers that support these new AKM suites. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200617113132.13477-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fe15f831841b..9f732499ea88 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3333,13 +3333,17 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) #define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) #define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_AP_PEER_KEY SUITE(0x000FAC, 10) #define WLAN_AKM_SUITE_8021X_SUITE_B SUITE(0x000FAC, 11) #define WLAN_AKM_SUITE_8021X_SUITE_B_192 SUITE(0x000FAC, 12) +#define WLAN_AKM_SUITE_FT_8021X_SHA384 SUITE(0x000FAC, 13) #define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) #define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) #define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) #define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_AKM_SUITE_OWE SUITE(0x000FAC, 18) +#define WLAN_AKM_SUITE_FT_PSK_SHA384 SUITE(0x000FAC, 19) +#define WLAN_AKM_SUITE_PSK_SHA384 SUITE(0x000FAC, 20) #define WLAN_MAX_KEY_LEN 32 -- cgit v1.2.3 From a9b59159d338d414acaa8e2f569d129d51c76452 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 24 Jun 2020 15:20:39 -0700 Subject: bpf: Do not allow btf_ctx_access with __int128 types To ensure btf_ctx_access() is safe the verifier checks that the BTF arg type is an int, enum, or pointer. When the function does the BTF arg lookup it uses the calculation 'arg = off / 8' using the fact that registers are 8B. This requires that the first arg is in the first reg, the second in the second, and so on. However, for __int128 the arg will consume two registers by default LLVM implementation. So this will cause the arg layout assumed by the 'arg = off / 8' calculation to be incorrect. Because __int128 is uncommon this patch applies the easiest fix and will force int types to be sizeof(u64) or smaller so that they will fit in a single register. v2: remove unneeded parens per Andrii's feedback Fixes: 9e15db66136a1 ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159303723962.11287.13309537171132420717.stgit@john-Precision-5820-Tower --- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 5c1ea99b480f..8b81fbb4497c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,6 +82,11 @@ static inline bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static inline bool btf_type_is_small_int(const struct btf_type *t) +{ + return btf_type_is_int(t) && t->size <= sizeof(u64); +} + static inline bool btf_type_is_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..9a1a98dd9e97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3746,7 +3746,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!btf_type_is_int(t)) { + if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3768,7 +3768,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { -- cgit v1.2.3 From 10652a9e9fe3fbcaca090f99cd3060ac3fee2913 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 10 Jun 2020 17:22:30 +0200 Subject: Revert "serial: core: Refactor uart_unlock_and_check_sysrq()" This reverts commit da9a5aa3402db0ff3b57216d8dbf2478e1046cae. In order to ease backporting a fix for a sysrq regression, revert this rewrite which was since added on top. The other sysrq helpers now bail out early when sysrq is not enabled; it's better to keep that pattern here as well. Note that the __releases() attribute won't be needed after the follow-on fix either. Fixes: da9a5aa3402d ("serial: core: Refactor uart_unlock_and_check_sysrq()") Cc: stable Signed-off-by: Johan Hovold Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200610152232.16925-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 23 +++++++++++++---------- include/linux/serial_core.h | 3 ++- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 13fb92ae3710..fcdb6bfbe2cf 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -3239,19 +3239,22 @@ int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) } EXPORT_SYMBOL_GPL(uart_prepare_sysrq_char); -void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long flags) -__releases(&port->lock) +void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags) { - if (port->has_sysrq) { - int sysrq_ch = port->sysrq_ch; + int sysrq_ch; - port->sysrq_ch = 0; - spin_unlock_irqrestore(&port->lock, flags); - if (sysrq_ch) - handle_sysrq(sysrq_ch); - } else { - spin_unlock_irqrestore(&port->lock, flags); + if (!port->has_sysrq) { + spin_unlock_irqrestore(&port->lock, irqflags); + return; } + + sysrq_ch = port->sysrq_ch; + port->sysrq_ch = 0; + + spin_unlock_irqrestore(&port->lock, irqflags); + + if (sysrq_ch) + handle_sysrq(sysrq_ch); } EXPORT_SYMBOL_GPL(uart_unlock_and_check_sysrq); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 9fd550e7946a..ef4921ddbe97 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -464,7 +464,8 @@ extern void uart_insert_char(struct uart_port *port, unsigned int status, extern int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch); extern int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch); -extern void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long flags); +extern void uart_unlock_and_check_sysrq(struct uart_port *port, + unsigned long irqflags); extern int uart_handle_break(struct uart_port *port); /* -- cgit v1.2.3 From 08d5470308ac3598e7709d08b8979ce6e9de8da2 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 10 Jun 2020 17:22:31 +0200 Subject: serial: core: fix sysrq overhead regression Commit 8e20fc391711 ("serial_core: Move sysrq functions from header file") converted the inline sysrq helpers to exported functions which are now called for every received character, interrupt and break signal also on systems without CONFIG_MAGIC_SYSRQ_SERIAL instead of being optimised away by the compiler. Inlining these helpers again also avoids the function call overhead when CONFIG_MAGIC_SYSRQ_SERIAL is enabled (e.g. when the port is not used as a console). Fixes: 8e20fc391711 ("serial_core: Move sysrq functions from header file") Cc: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Johan Hovold Cc: stable Reviewed-by: Andy Shevchenko Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://lore.kernel.org/r/20200610152232.16925-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 99 +------------------------------------ include/linux/serial_core.h | 103 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index fcdb6bfbe2cf..abb102e71b14 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -41,8 +41,6 @@ static struct lock_class_key port_lock_key; #define HIGH_BITS_OFFSET ((sizeof(long)-sizeof(int))*8) -#define SYSRQ_TIMEOUT (HZ * 5) - static void uart_change_speed(struct tty_struct *tty, struct uart_state *state, struct ktermios *old_termios); static void uart_wait_until_sent(struct tty_struct *tty, int timeout); @@ -3163,7 +3161,7 @@ static DECLARE_WORK(sysrq_enable_work, uart_sysrq_on); * Returns false if @ch is out of enabling sequence and should be * handled some other way, true if @ch was consumed. */ -static bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch) +bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch) { int sysrq_toggle_seq_len = strlen(sysrq_toggle_seq); @@ -3186,102 +3184,9 @@ static bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch) port->sysrq = 0; return true; } -#else -static inline bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch) -{ - return false; -} +EXPORT_SYMBOL_GPL(uart_try_toggle_sysrq); #endif -int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) -{ - if (!IS_ENABLED(CONFIG_MAGIC_SYSRQ_SERIAL)) - return 0; - - if (!port->has_sysrq || !port->sysrq) - return 0; - - if (ch && time_before(jiffies, port->sysrq)) { - if (sysrq_mask()) { - handle_sysrq(ch); - port->sysrq = 0; - return 1; - } - if (uart_try_toggle_sysrq(port, ch)) - return 1; - } - port->sysrq = 0; - - return 0; -} -EXPORT_SYMBOL_GPL(uart_handle_sysrq_char); - -int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) -{ - if (!IS_ENABLED(CONFIG_MAGIC_SYSRQ_SERIAL)) - return 0; - - if (!port->has_sysrq || !port->sysrq) - return 0; - - if (ch && time_before(jiffies, port->sysrq)) { - if (sysrq_mask()) { - port->sysrq_ch = ch; - port->sysrq = 0; - return 1; - } - if (uart_try_toggle_sysrq(port, ch)) - return 1; - } - port->sysrq = 0; - - return 0; -} -EXPORT_SYMBOL_GPL(uart_prepare_sysrq_char); - -void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags) -{ - int sysrq_ch; - - if (!port->has_sysrq) { - spin_unlock_irqrestore(&port->lock, irqflags); - return; - } - - sysrq_ch = port->sysrq_ch; - port->sysrq_ch = 0; - - spin_unlock_irqrestore(&port->lock, irqflags); - - if (sysrq_ch) - handle_sysrq(sysrq_ch); -} -EXPORT_SYMBOL_GPL(uart_unlock_and_check_sysrq); - -/* - * We do the SysRQ and SAK checking like this... - */ -int uart_handle_break(struct uart_port *port) -{ - struct uart_state *state = port->state; - - if (port->handle_break) - port->handle_break(port); - - if (port->has_sysrq && uart_console(port)) { - if (!port->sysrq) { - port->sysrq = jiffies + SYSRQ_TIMEOUT; - return 1; - } - port->sysrq = 0; - } - - if (port->flags & UPF_SAK) - do_SAK(state->port.tty); - return 0; -} -EXPORT_SYMBOL_GPL(uart_handle_break); - EXPORT_SYMBOL(uart_write_wakeup); EXPORT_SYMBOL(uart_register_driver); EXPORT_SYMBOL(uart_unregister_driver); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index ef4921ddbe97..03fa7b967103 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -462,11 +462,104 @@ extern void uart_handle_cts_change(struct uart_port *uport, extern void uart_insert_char(struct uart_port *port, unsigned int status, unsigned int overrun, unsigned int ch, unsigned int flag); -extern int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch); -extern int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch); -extern void uart_unlock_and_check_sysrq(struct uart_port *port, - unsigned long irqflags); -extern int uart_handle_break(struct uart_port *port); +#ifdef CONFIG_MAGIC_SYSRQ_SERIAL +#define SYSRQ_TIMEOUT (HZ * 5) + +bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch); + +static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) +{ + if (!port->has_sysrq || !port->sysrq) + return 0; + + if (ch && time_before(jiffies, port->sysrq)) { + if (sysrq_mask()) { + handle_sysrq(ch); + port->sysrq = 0; + return 1; + } + if (uart_try_toggle_sysrq(port, ch)) + return 1; + } + port->sysrq = 0; + + return 0; +} + +static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) +{ + if (!port->has_sysrq || !port->sysrq) + return 0; + + if (ch && time_before(jiffies, port->sysrq)) { + if (sysrq_mask()) { + port->sysrq_ch = ch; + port->sysrq = 0; + return 1; + } + if (uart_try_toggle_sysrq(port, ch)) + return 1; + } + port->sysrq = 0; + + return 0; +} + +static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags) +{ + int sysrq_ch; + + if (!port->has_sysrq) { + spin_unlock_irqrestore(&port->lock, irqflags); + return; + } + + sysrq_ch = port->sysrq_ch; + port->sysrq_ch = 0; + + spin_unlock_irqrestore(&port->lock, irqflags); + + if (sysrq_ch) + handle_sysrq(sysrq_ch); +} +#else /* CONFIG_MAGIC_SYSRQ_SERIAL */ +static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) +{ + return 0; +} +static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) +{ + return 0; +} +static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags) +{ + spin_unlock_irqrestore(&port->lock, irqflags); +} +#endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ + +/* + * We do the SysRQ and SAK checking like this... + */ +static inline int uart_handle_break(struct uart_port *port) +{ + struct uart_state *state = port->state; + + if (port->handle_break) + port->handle_break(port); + +#ifdef CONFIG_MAGIC_SYSRQ_SERIAL + if (port->has_sysrq && uart_console(port)) { + if (!port->sysrq) { + port->sysrq = jiffies + SYSRQ_TIMEOUT; + return 1; + } + port->sysrq = 0; + } +#endif + if (port->flags & UPF_SAK) + do_SAK(state->port.tty); + return 0; +} /* * UART_ENABLE_MS - determine if port should enable modem status irqs -- cgit v1.2.3 From 225385657b7d81a99e17e04cd01f9ed5bb3109a8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 10 Jun 2020 17:22:32 +0200 Subject: serial: core: drop redundant sysrq checks The sysrq timestamp will never be set unless port->has_sysrq is set (see uart_handle_break()) so drop the redundant checks that were added by commit 1997e9dfdc84 ("serial_core: Un-ifdef sysrq SUPPORT_SYSRQ"). Signed-off-by: Johan Hovold Reviewed-by: Andy Shevchenko Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://lore.kernel.org/r/20200610152232.16925-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 03fa7b967103..791f4844efeb 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -469,7 +469,7 @@ bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch); static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) { - if (!port->has_sysrq || !port->sysrq) + if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { @@ -488,7 +488,7 @@ static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) { - if (!port->has_sysrq || !port->sysrq) + if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { -- cgit v1.2.3 From 3aa91625007807bfca4155df1867a5c924a08662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:56 +0200 Subject: dma-mapping: Add a new dma_need_sync API Add a new API to check if calls to dma_sync_single_for_{device,cpu} are required for a given DMA streaming mapping. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-2-hch@lst.de --- Documentation/core-api/dma-api.rst | 8 ++++++++ include/linux/dma-direct.h | 1 + include/linux/dma-mapping.h | 5 +++++ kernel/dma/direct.c | 6 ++++++ kernel/dma/mapping.c | 10 ++++++++++ 5 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 2d8d2fed7317..f41620439ef3 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -204,6 +204,14 @@ Returns the maximum size of a mapping for the device. The size parameter of the mapping functions like dma_map_single(), dma_map_page() and others should not be larger than the returned value. +:: + + bool + dma_need_sync(struct device *dev, dma_addr_t dma_addr); + +Returns %true if dma_sync_single_for_{device,cpu} calls are required to +transfer memory ownership. Returns %false if those calls can be skipped. + :: unsigned long diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 136f984df0d9..8b006730687b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -87,4 +87,5 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 78f677cf45ab..a33ed3954ed4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -461,6 +461,7 @@ int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, @@ -571,6 +572,10 @@ static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return false; +} static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0a4881e59aa7..ecb922a0bfa0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -530,3 +530,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..a8c18c9a796f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -397,6 +397,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.2.3 From 4ac2add65974e4efafb8d4ccd8fc5660417ea312 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:26 +0100 Subject: bpf: flow_dissector: Check value of unused flags to BPF_PROG_DETACH Using BPF_PROG_DETACH on a flow dissector program supports neither attach_flags nor attach_bpf_fd. Yet no value is enforced for them. Enforce that attach_flags are zero, and require the current program to be passed via attach_bpf_fd. This allows us to remove the check for CAP_SYS_ADMIN, since userspace can now no longer remove arbitrary flow dissector programs. Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-3-lmb@cloudflare.com --- include/linux/bpf-netns.h | 5 +++-- kernel/bpf/net_namespace.c | 19 +++++++++++++++---- kernel/bpf/syscall.c | 4 +--- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 4052d649f36d..47d5b0c708c9 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -33,7 +33,7 @@ int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); -int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog); #else @@ -49,7 +49,8 @@ static inline int netns_bpf_prog_attach(const union bpf_attr *attr, return -EOPNOTSUPP; } -static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +static inline int netns_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) { return -EOPNOTSUPP; } diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 03045f45afec..3dbc29b6f51d 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -269,7 +269,8 @@ out_unlock: /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) + enum netns_bpf_attach_type type, + struct bpf_prog *old) { struct bpf_prog *attached; @@ -278,7 +279,7 @@ static int __netns_bpf_prog_detach(struct net *net, return -EINVAL; attached = net->bpf.progs[type]; - if (!attached) + if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; @@ -286,19 +287,29 @@ static int __netns_bpf_prog_detach(struct net *net, return 0; } -int netns_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; + struct bpf_prog *prog; int ret; + if (attr->target_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); + bpf_prog_put(prog); + return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d946435587d..28c6ef759037 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2897,9 +2897,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - return netns_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr, ptype); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: -- cgit v1.2.3 From bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: bpf: sockmap: Require attach_bpf_fd when detaching a program The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com --- include/linux/bpf.h | 13 +++++++++++-- include/linux/skmsg.h | 13 +++++++++++++ kernel/bpf/syscall.c | 2 +- net/core/sock_map.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 70 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07052d44bca1..9750a1902ee5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1543,13 +1543,16 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, u32 which) + struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { return -EOPNOTSUPP; } @@ -1559,6 +1562,12 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 08674cd14d5a..1e9ed840b9fc 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -430,6 +430,19 @@ static inline void psock_set_prog(struct bpf_prog **pprog, bpf_prog_put(prog); } +static inline int psock_replace_prog(struct bpf_prog **pprog, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + if (cmpxchg(pprog, old, prog) != old) + return -ENOENT; + + if (old) + bpf_prog_put(old); + + return 0; +} + static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 28c6ef759037..a74fce8ce043 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2893,7 +2893,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, ptype); case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 58016a5c63ff..0971f17e8e54 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -77,7 +77,42 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, attr->attach_type); + ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + fdput(f); + return ret; +} + +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + u32 ufd = attr->target_fd; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + prog = bpf_prog_get(attr->attach_bpf_fd); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto put_map; + } + + if (prog->type != ptype) { + ret = -EINVAL; + goto put_prog; + } + + ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +put_prog: + bpf_prog_put(prog); +put_map: fdput(f); return ret; } @@ -1206,27 +1241,32 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) } int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - u32 which) + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - psock_set_prog(&progs->msg_parser, prog); + pprog = &progs->msg_parser; break; case BPF_SK_SKB_STREAM_PARSER: - psock_set_prog(&progs->skb_parser, prog); + pprog = &progs->skb_parser; break; case BPF_SK_SKB_STREAM_VERDICT: - psock_set_prog(&progs->skb_verdict, prog); + pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + if (old) + return psock_replace_prog(pprog, prog, old); + + psock_set_prog(pprog, prog); return 0; } -- cgit v1.2.3 From c463bb2a8f8d7d97aa414bf7714fc77e9d3b10df Mon Sep 17 00:00:00 2001 From: Merlijn Wajer Date: Tue, 30 Jun 2020 11:47:04 -0700 Subject: Input: add `SW_MACHINE_COVER` This event code represents the state of a removable cover of a device. Value 0 means that the cover is open or removed, value 1 means that the cover is closed. Reviewed-by: Sebastian Reichel Acked-by: Tony Lindgren Signed-off-by: Merlijn Wajer Link: https://lore.kernel.org/r/20200612125402.18393-2-merlijn@wizzup.org Signed-off-by: Dmitry Torokhov --- include/linux/mod_devicetable.h | 2 +- include/uapi/linux/input-event-codes.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index e3596db077dc..04a19e30b168 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -318,7 +318,7 @@ struct pcmcia_device_id { #define INPUT_DEVICE_ID_LED_MAX 0x0f #define INPUT_DEVICE_ID_SND_MAX 0x07 #define INPUT_DEVICE_ID_FF_MAX 0x7f -#define INPUT_DEVICE_ID_SW_MAX 0x0f +#define INPUT_DEVICE_ID_SW_MAX 0x10 #define INPUT_DEVICE_ID_PROP_MAX 0x1f #define INPUT_DEVICE_ID_MATCH_BUS 1 diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index b6a835d37826..0c2e27d28e0a 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -888,7 +888,8 @@ #define SW_LINEIN_INSERT 0x0d /* set = inserted */ #define SW_MUTE_DEVICE 0x0e /* set = device disabled */ #define SW_PEN_INSERTED 0x0f /* set = pen inserted */ -#define SW_MAX 0x0f +#define SW_MACHINE_COVER 0x10 /* set = cover closed */ +#define SW_MAX 0x10 #define SW_CNT (SW_MAX+1) /* -- cgit v1.2.3 From d7bf2ebebc2bd61ab95e2a8e33541ef282f303d4 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 3 Jul 2020 22:26:43 +0200 Subject: sched: consistently handle layer3 header accesses in the presence of VLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple of places in net/sched/ that check skb->protocol and act on the value there. However, in the presence of VLAN tags, the value stored in skb->protocol can be inconsistent based on whether VLAN acceleration is enabled. The commit quoted in the Fixes tag below fixed the users of skb->protocol to use a helper that will always see the VLAN ethertype. However, most of the callers don't actually handle the VLAN ethertype, but expect to find the IP header type in the protocol field. This means that things like changing the ECN field, or parsing diffserv values, stops working if there's a VLAN tag, or if there are multiple nested VLAN tags (QinQ). To fix this, change the helper to take an argument that indicates whether the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we make sure to skip all of them, so behaviour is consistent even in QinQ mode. To make the helper usable from the ECN code, move it to if_vlan.h instead of pkt_sched.h. v3: - Remove empty lines - Move vlan variable definitions inside loop in skb_protocol() - Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and bpf_skb_ecn_set_ce() v2: - Use eth_type_vlan() helper in skb_protocol() - Also fix code that reads skb->protocol directly - Change a couple of 'if/else if' statements to switch constructs to avoid calling the helper twice Reported-by: Ilya Ponetayev Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 28 ++++++++++++++++++++++++++++ include/net/inet_ecn.h | 25 +++++++++++++++++-------- include/net/pkt_sched.h | 11 ----------- net/core/filter.c | 10 +++++++--- net/sched/act_connmark.c | 9 ++++++--- net/sched/act_csum.c | 2 +- net/sched/act_ct.c | 9 ++++----- net/sched/act_ctinfo.c | 9 ++++++--- net/sched/act_mpls.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/cls_api.c | 2 +- net/sched/cls_flow.c | 8 ++++---- net/sched/cls_flower.c | 2 +- net/sched/em_ipset.c | 2 +- net/sched/em_ipt.c | 2 +- net/sched/em_meta.c | 2 +- net/sched/sch_cake.c | 4 ++-- net/sched/sch_dsmark.c | 6 +++--- net/sched/sch_teql.c | 2 +- 19 files changed, 86 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index b05e855f1ddd..427a5b8597c2 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -308,6 +308,34 @@ static inline bool eth_type_vlan(__be16 ethertype) } } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); + __be16 proto = skb->protocol; + + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; + + while (eth_type_vlan(proto)) { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); + if (!vh) + break; + + proto = vh->h_vlan_encapsulated_proto; + offset += sizeof(vhdr); + } + + return proto; +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 0f0d1efe06dd..e1eaf1780288 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -172,7 +173,7 @@ static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) static inline int INET_ECN_set_ce(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -191,7 +192,7 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb) static inline int INET_ECN_set_ect1(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -272,12 +273,16 @@ static inline int IP_ECN_decapsulate(const struct iphdr *oiph, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, oiph->tos, inner); } @@ -287,12 +292,16 @@ static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9092e697059e..ac8c890a2657 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -136,17 +136,6 @@ static inline void qdisc_run(struct Qdisc *q) } } -static inline __be16 tc_skb_protocol(const struct sk_buff *skb) -{ - /* We need to take extra care in case the skb came via - * vlan accelerated path. In that case, use skb->vlan_proto - * as the original vlan header was already stripped. - */ - if (skb_vlan_tag_present(skb)) - return skb->vlan_proto; - return skb->protocol; -} - /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ diff --git a/net/core/filter.c b/net/core/filter.c index 73395384afe2..82e1b5b06167 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5853,12 +5853,16 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; - if (skb->protocol == cpu_to_be16(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); - else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + break; + case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); - else + break; + default: return 0; + } if (skb_headlen(skb) < iphdr_len) return 0; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 43a243081e7d..f901421b0634 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -43,17 +43,20 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&ca->tcf_tm); bstats_update(&ca->tcf_bstats, skb); - if (skb->protocol == htons(ETH_P_IP)) { + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): if (skb->len < sizeof(struct iphdr)) goto out; proto = NFPROTO_IPV4; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + break; + case htons(ETH_P_IPV6): if (skb->len < sizeof(struct ipv6hdr)) goto out; proto = NFPROTO_IPV6; - } else { + break; + default: goto out; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index cb8608f0a77a..c60674cf25c4 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -587,7 +587,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, goto drop; update_flags = params->update_flags; - protocol = tc_skb_protocol(skb); + protocol = skb_protocol(skb, false); again: switch (protocol) { case cpu_to_be16(ETH_P_IP): diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index e9f3576cbf71..86ed02487467 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -624,7 +624,7 @@ static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) { u8 family = NFPROTO_UNSPEC; - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): family = NFPROTO_IPV4; break; @@ -748,6 +748,7 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { + __be16 proto = skb_protocol(skb, true); int hooknum, err = NF_ACCEPT; /* See HOOK2MANIP(). */ @@ -759,14 +760,13 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: - if (skb->protocol == htons(ETH_P_IP) && + if (proto == htons(ETH_P_IP) && ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, hooknum)) err = NF_DROP; goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && - skb->protocol == htons(ETH_P_IPV6)) { + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { __be16 frag_off; u8 nexthdr = ipv6_hdr(skb)->nexthdr; int hdrlen = ipv6_skip_exthdr(skb, @@ -1550,4 +1550,3 @@ MODULE_AUTHOR("Yossi Kuperman "); MODULE_AUTHOR("Marcelo Ricardo Leitner "); MODULE_DESCRIPTION("Connection tracking action"); MODULE_LICENSE("GPL v2"); - diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 19649623493b..b5042f3ea079 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -96,19 +96,22 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); - if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV4; - } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { + break; + case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV6; - } else { + break; + default: goto out; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index be3f215cd027..8118e2640979 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -82,7 +82,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_MPLS_ACT_PUSH: - new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); + new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb_protocol(skb, true))); if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b125b2be4467..b2b3faa57294 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -41,7 +41,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index faa78b7dd962..e62beec0d844 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1538,7 +1538,7 @@ static inline int __tcf_classify(struct sk_buff *skb, reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { - __be16 protocol = tc_skb_protocol(skb); + __be16 protocol = skb_protocol(skb, false); int err; if (tp->protocol != protocol && diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 80ae7b9fa90a..ab53a93b2f2b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -80,7 +80,7 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) if (dst) return ntohl(dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_proto(const struct sk_buff *skb, @@ -104,7 +104,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb, if (flow->ports.ports) return ntohs(flow->ports.dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_iif(const struct sk_buff *skb) @@ -151,7 +151,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb) static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); case htons(ETH_P_IPV6): @@ -164,7 +164,7 @@ fallback: static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); case htons(ETH_P_IPV6): diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b2da37286082..e30bd969fc48 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -313,7 +313,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* skb_flow_dissect() does not set n_proto in case an unknown * protocol, so do it rather here. */ - skb_key.basic.n_proto = skb->protocol; + skb_key.basic.n_proto = skb_protocol(skb, false); skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index df00566d327d..c95cf86fb431 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -59,7 +59,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, }; int ret, network_offset; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): state.pf = NFPROTO_IPV4; if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index 18755d29fd15..3650117da47f 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -212,7 +212,7 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, struct nf_hook_state state; int ret; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) return 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index d99966a55c84..46254968d390 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -195,7 +195,7 @@ META_COLLECTOR(int_priority) META_COLLECTOR(int_protocol) { /* Let userspace take care of the byte ordering */ - dst->value = tc_skb_protocol(skb); + dst->value = skb_protocol(skb, false); } META_COLLECTOR(int_pkttype) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ca813697728e..ebaeec1e5c82 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -592,7 +592,7 @@ static bool cake_update_flowkeys(struct flow_keys *keys, bool rev = !skb->_nfct, upd = false; __be32 ip; - if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + if (skb_protocol(skb, true) != htons(ETH_P_IP)) return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) @@ -1557,7 +1557,7 @@ static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) u16 *buf, buf_; u8 dscp; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); if (unlikely(!buf)) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 05605b30bef3..2b88710994d7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -210,7 +210,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (p->set_tc_index) { int wlen = skb_network_offset(skb); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen) || @@ -303,7 +303,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) index = skb->tc_index & (p->indices - 1); pr_debug("index %d->%d\n", skb->tc_index, index); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, p->mv[index].value); @@ -320,7 +320,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) */ if (p->mv[index].mask != 0xff || p->mv[index].value) pr_warn("%s: unsupported protocol %d\n", - __func__, ntohs(tc_skb_protocol(skb))); + __func__, ntohs(skb_protocol(skb, true))); break; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 689ef6f3ded8..2f1f0a378408 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -239,7 +239,7 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, n, dev); - err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)), + err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)), haddr, NULL, skb->len); if (err < 0) -- cgit v1.2.3 From 68d237056e007c88031d80900cdba0945121a287 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 30 Jun 2020 10:16:02 +0200 Subject: scatterlist: protect parameters of the sg_table related macros Add brackets to protect parameters of the recently added sg_table related macros from side-effects. Fixes: 709d6d73c756 ("scatterlist: add generic wrappers for iterating over sgtable objects") Signed-off-by: Marek Szyprowski Signed-off-by: Christoph Hellwig --- include/linux/scatterlist.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 4f922afb607a..45cf7b69d852 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -155,7 +155,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, * Loop over each sg element in the given sg_table object. */ #define for_each_sgtable_sg(sgt, sg, i) \ - for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i) /* * Loop over each sg element in the given *DMA mapped* sg_table object. @@ -163,7 +163,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, * of the each element. */ #define for_each_sgtable_dma_sg(sgt, sg, i) \ - for_each_sg(sgt->sgl, sg, sgt->nents, i) + for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) /** * sg_chain - Chain two sglists together @@ -451,7 +451,7 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. */ #define for_each_sgtable_page(sgt, piter, pgoffset) \ - for_each_sg_page(sgt->sgl, piter, sgt->orig_nents, pgoffset) + for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset) /** * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object @@ -465,7 +465,7 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * unit. */ #define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ - for_each_sg_dma_page(sgt->sgl, dma_iter, sgt->nents, pgoffset) + for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset) /* -- cgit v1.2.3 From ad0f75e5f57ccbceec13274e1e242f2b5a6397ed Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Jul 2020 11:52:56 -0700 Subject: cgroup: fix cgroup_sk_alloc() for sk_clone_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled. sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable. The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that. This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas Reported-by: Peter Geis Reported-by: Lu Fengqi Reported-by: Daniël Sonck Reported-by: Zhang Qiang Tested-by: Cameron Berkenpas Tested-by: Peter Geis Tested-by: Thomas Lamprecht Cc: Daniel Borkmann Cc: Zefan Li Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 6 ++++-- include/linux/cgroup.h | 4 +++- kernel/cgroup/cgroup.c | 31 +++++++++++++++++++------------ net/core/sock.c | 2 +- 4 files changed, 27 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 52661155f85f..4f1cd0edc57d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -790,7 +790,8 @@ struct sock_cgroup_data { union { #ifdef __LITTLE_ENDIAN struct { - u8 is_data; + u8 is_data : 1; + u8 no_refcnt : 1; u8 padding; u16 prioidx; u32 classid; @@ -800,7 +801,8 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; - u8 is_data; + u8 no_refcnt : 1; + u8 is_data : 1; } __packed; #endif u64 val; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4598e4da6b1b..618838c48313 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -822,6 +822,7 @@ extern spinlock_t cgroup_sk_update_lock; void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) @@ -835,7 +836,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) */ v = READ_ONCE(skcd->val); - if (v & 1) + if (v & 3) return &cgrp_dfl_root.cgrp; return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; @@ -847,6 +848,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1ea181a58465..dd247747ec14 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,18 +6439,8 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; } @@ -6475,10 +6465,27 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); } +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) + return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/net/core/sock.c b/net/core/sock.c index d832c650287c..2e5b7870e5d3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1926,7 +1926,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; - cgroup_sk_alloc(&newsk->sk_cgrp_data); + cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); -- cgit v1.2.3 From 41da51bce36f44eefc1e3d0f47d18841cbd065ba Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 21 Nov 2019 23:25:07 +0000 Subject: fs: Add IOCB_NOIO flag for generic_file_read_iter Add an IOCB_NOIO flag that indicates to generic_file_read_iter that it shouldn't trigger any filesystem I/O for the actual request or for readahead. This allows to do tentative reads out of the page cache as some filesystems allow, and to take the appropriate locks and retry the reads only if the requested pages are not cached. Signed-off-by: Andreas Gruenbacher --- include/linux/fs.h | 1 + mm/filemap.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f881a892ea7..4b7cb76e5837 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +#define IOCB_NOIO (1 << 9) struct kiocb { struct file *ki_filp; diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..385759c4ce4b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2028,7 +2028,7 @@ find_page: page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto would_block; page_cache_sync_readahead(mapping, ra, filp, @@ -2038,6 +2038,10 @@ find_page: goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); @@ -2160,6 +2164,11 @@ page_not_up_to_date_locked: } readpage: + if (iocb->ki_flags & IOCB_NOIO) { + unlock_page(page); + put_page(page); + goto would_block; + } /* * A previous I/O error may have been due to temporary * failures, eg. multipath errors. @@ -2249,9 +2258,19 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read); * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall + * be returned when no data can be read without waiting for I/O requests + * to complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O + * requests shall be made for the read or for readahead. When no data + * can be read, -EAGAIN shall be returned. When readahead would be + * triggered, a partial, possibly empty read shall be returned. + * * Return: * * number of bytes copied, even for partial reads - * * negative error code if nothing was read + * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) -- cgit v1.2.3 From 469aceddfa3ed16e17ee30533fae45e90f62efd8 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 7 Jul 2020 13:03:25 +0200 Subject: vlan: consolidate VLAN parsing code and limit max parsing depth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toshiaki pointed out that we now have two very similar functions to extract the L3 protocol number in the presence of VLAN tags. And Daniel pointed out that the unbounded parsing loop makes it possible for maliciously crafted packets to loop through potentially hundreds of tags. Fix both of these issues by consolidating the two parsing functions and limiting the VLAN tag parsing to a max depth of 8 tags. As part of this, switch over __vlan_get_protocol() to use skb_header_pointer() instead of pskb_may_pull(), to avoid the possible side effects of the latter and keep the skb pointer 'const' through all the parsing functions. v2: - Use limit of 8 tags instead of 32 (matching XMIT_RECURSION_LIMIT) Reported-by: Toshiaki Makita Reported-by: Daniel Borkmann Fixes: d7bf2ebebc2b ("sched: consistently handle layer3 header accesses in the presence of VLANs") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 57 +++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 427a5b8597c2..41a518336673 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -25,6 +25,8 @@ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ +#define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ + /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID @@ -308,34 +310,6 @@ static inline bool eth_type_vlan(__be16 ethertype) } } -/* A getter for the SKB protocol field which will handle VLAN tags consistently - * whether VLAN acceleration is enabled or not. - */ -static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) -{ - unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); - __be16 proto = skb->protocol; - - if (!skip_vlan) - /* VLAN acceleration strips the VLAN header from the skb and - * moves it to skb->vlan_proto - */ - return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; - - while (eth_type_vlan(proto)) { - struct vlan_hdr vhdr, *vh; - - vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); - if (!vh) - break; - - proto = vh->h_vlan_encapsulated_proto; - offset += sizeof(vhdr); - } - - return proto; -} - static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -605,10 +579,10 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { - unsigned int vlan_depth = skb->mac_len; + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at @@ -623,13 +597,12 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vlan_depth = ETH_HLEN; } do { - struct vlan_hdr *vh; + struct vlan_hdr vhdr, *vh; - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) + vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); @@ -648,11 +621,25 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 vlan_get_protocol(struct sk_buff *skb) +static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; + + return vlan_get_protocol(skb); +} + static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { -- cgit v1.2.3 From 61a707c543e2afe3aa7e88f87267c5dafa4b5afa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 May 2020 08:54:16 +0200 Subject: fs: add a __kernel_read helper This is the counterpart to __kernel_write, and skip the rw_verify_area call compared to kernel_read. Signed-off-by: Christoph Hellwig --- fs/read_write.c | 23 +++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index 96e8e354f99b..21c9d90a257e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -430,6 +430,29 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, return -EINVAL; } +ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs = get_fs(); + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) + return -EINVAL; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + set_fs(KERNEL_DS); + ret = __vfs_read(file, (void __user *)buf, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); + return ret; +} + ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f881a892ea7..22cbe7b2e919 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3033,6 +3033,7 @@ extern int kernel_read_file_from_path_initns(const char *, void **, loff_t *, lo extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t, enum kernel_read_file_id); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); +ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); -- cgit v1.2.3 From 775802c0571fb438cd4f6548a323f9e4cb89f5aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 May 2020 11:17:46 +0200 Subject: fs: remove __vfs_read Fold it into the two callers. Signed-off-by: Christoph Hellwig --- fs/read_write.c | 43 +++++++++++++++++++++---------------------- include/linux/fs.h | 1 - 2 files changed, 21 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index 42a027719324..4fb797822567 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -419,17 +419,6 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo return ret; } -ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, - loff_t *pos) -{ - if (file->f_op->read) - return file->f_op->read(file, buf, count, pos); - else if (file->f_op->read_iter) - return new_sync_read(file, buf, count, pos); - else - return -EINVAL; -} - ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs = get_fs(); @@ -443,7 +432,12 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; set_fs(KERNEL_DS); - ret = __vfs_read(file, (void __user *)buf, count, pos); + if (file->f_op->read) + ret = file->f_op->read(file, (void __user *)buf, count, pos); + else if (file->f_op->read_iter) + ret = new_sync_read(file, (void __user *)buf, count, pos); + else + ret = -EINVAL; set_fs(old_fs); if (ret > 0) { fsnotify_access(file); @@ -476,17 +470,22 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); - if (!ret) { - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - ret = __vfs_read(file, buf, count, pos); - if (ret > 0) { - fsnotify_access(file); - add_rchar(current, ret); - } - inc_syscr(current); - } + if (ret) + return ret; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else if (file->f_op->read_iter) + ret = new_sync_read(file, buf, count, pos); + else + ret = -EINVAL; + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); return ret; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 22cbe7b2e919..0c0ec76b600b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1917,7 +1917,6 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, struct iovec *fast_pointer, struct iovec **ret_pointer); -extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, -- cgit v1.2.3 From dbfb089d360b1cc623c51a2c7cf9b99eff78e0e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2020 12:40:33 +0200 Subject: sched: Fix loadavg accounting race The recent commit: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") moved these lines in ttwu(): p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; up before: smp_cond_load_acquire(&p->on_cpu, !VAL); into the 'p->on_rq == 0' block, with the thinking that once we hit schedule() the current task cannot change it's ->state anymore. And while this is true, it is both incorrect and flawed. It is incorrect in that we need at least an ACQUIRE on 'p->on_rq == 0' to avoid weak hardware from re-ordering things for us. This can fairly easily be achieved by relying on the control-dependency already in place. The second problem, which makes the flaw in the original argument, is that while schedule() will not change prev->state, it will read it a number of times (arguably too many times since it's marked volatile). The previous condition 'p->on_cpu == 0' was sufficient because that indicates schedule() has completed, and will no longer read prev->state. So now the trick is to make this same true for the (much) earlier 'prev->on_rq == 0' case. Furthermore, in order to make the ordering stick, the 'prev->on_rq = 0' assignment needs to he a RELEASE, but adding additional ordering to schedule() is an unwelcome proposition at the best of times, doubly so for mere accounting. Luckily we can push the prev->state load up before rq->lock, with the only caveat that we then have to re-read the state after. However, we know that if it changed, we no longer have to worry about the blocking path. This gives us the required ordering, if we block, we did the prev->state load before an (effective) smp_mb() and the p->on_rq store needs not change. With this we end up with the effective ordering: LOAD p->state LOAD-ACQUIRE p->on_rq == 0 MB STORE p->on_rq, 0 STORE p->state, TASK_WAKING which ensures the TASK_WAKING store happens after the prev->state load, and all is well again. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Dave Jones Reported-by: Paul Gortmaker Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dave Jones Tested-by: Paul Gortmaker Link: https://lkml.kernel.org/r/20200707102957.GN117543@hirez.programming.kicks-ass.net --- include/linux/sched.h | 4 --- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 51 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 692e327d7455..683372943093 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -114,10 +114,6 @@ struct task_group; #define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) -#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0 && \ - (task->state & TASK_NOLOAD) == 0) - #ifdef CONFIG_DEBUG_ATOMIC_SLEEP /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca5db40392d4..950ac45d5480 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #ifdef CONFIG_SMP - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -4097,6 +4100,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4113,12 +4117,22 @@ static void __sched notrace __schedule(bool preempt) local_irq_disable(); rcu_note_context_switch(preempt); + /* See deactivate_task() below. */ + prev_state = prev->state; + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4129,10 +4143,31 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + /* + * We must re-load prev->state in case ttwu_remote() changed it + * before we acquired rq->lock. + */ + if (!preempt && prev_state && prev_state == prev->state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...) + * LOCK rq->lock goto out; + * smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep(); + * p->on_rq = 0; p->state = TASK_WAKING; + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { -- cgit v1.2.3 From 6ec4476ac82512f09c94aff5972654b70f3772b2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jul 2020 10:48:35 -0700 Subject: Raise gcc version requirement to 4.9 I realize that we fairly recently raised it to 4.8, but the fact is, 4.9 is a much better minimum version to target. We have a number of workarounds for actual bugs in pre-4.9 gcc versions (including things like internal compiler errors on ARM), but we also have some syntactic workarounds for lacking features. In particular, raising the minimum to 4.9 means that we can now just assume _Generic() exists, which is likely the much better replacement for a lot of very convoluted built-time magic with conditionals on sizeof and/or __builtin_choose_expr() with same_type() etc. Using _Generic also means that you will need to have a very recent version of 'sparse', but thats easy to build yourself, and much less of a hassle than some old gcc version can be. The latest (in a long string) of reasons for minimum compiler version upgrades was commit 5435f73d5c4a ("efi/x86: Fix build with gcc 4"). Ard points out that RHEL 7 uses gcc-4.8, but the people who stay back on old RHEL versions persumably also don't build their own kernels anyway. And maybe they should cross-built or just have a little side affair with a newer compiler? Acked-by: Ard Biesheuvel Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- arch/arm/kernel/asm-offsets.c | 9 --------- arch/mips/include/asm/unroll.h | 7 +++---- include/linux/bits.h | 3 +-- include/linux/compiler-gcc.h | 2 +- include/linux/compiler_types.h | 27 +-------------------------- mm/migrate.c | 13 +------------ tools/include/linux/bits.h | 3 +-- 7 files changed, 8 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index c036a4a2f8e2..a1570c8bab25 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -31,15 +31,6 @@ #if defined(__APCS_26__) #error Sorry, your compiler targets APCS-26 but this kernel requires APCS-32 #endif -/* - * GCC 4.8.0-4.8.2: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58854 - * miscompiles find_get_entry(), and can result in EXT3 and EXT4 - * filesystem corruption (possibly other FS too). - */ -#if defined(GCC_VERSION) && GCC_VERSION >= 40800 && GCC_VERSION < 40803 -#error Your compiler is too buggy; it is known to miscompile kernels -#error and result in filesystem corruption and oopses. -#endif int main(void) { diff --git a/arch/mips/include/asm/unroll.h b/arch/mips/include/asm/unroll.h index c628747d4ecd..8ed660adc84f 100644 --- a/arch/mips/include/asm/unroll.h +++ b/arch/mips/include/asm/unroll.h @@ -19,14 +19,13 @@ \ /* \ * We can't unroll if the number of iterations isn't \ - * compile-time constant. Unfortunately GCC versions \ - * up until 4.6 tend to miss obvious constants & cause \ + * compile-time constant. Unfortunately clang versions \ + * up until 8.0 tend to miss obvious constants & cause \ * this check to fail, even though they go on to \ * generate reasonable code for the switch statement, \ * so we skip the sanity check for those compilers. \ */ \ - BUILD_BUG_ON((CONFIG_GCC_VERSION >= 40700 || \ - CONFIG_CLANG_VERSION >= 80000) && \ + BUILD_BUG_ON((CONFIG_CLANG_VERSION >= 80000) && \ !__builtin_constant_p(times)); \ \ switch (times) { \ diff --git a/include/linux/bits.h b/include/linux/bits.h index 4671fbf28842..7f475d59a097 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -18,8 +18,7 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#if !defined(__ASSEMBLY__) && \ - (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#if !defined(__ASSEMBLY__) #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 1c74464c80c6..0b1dc61f3955 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -11,7 +11,7 @@ + __GNUC_PATCHLEVEL__) /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ -#if GCC_VERSION < 40800 +#if GCC_VERSION < 40900 # error Sorry, your compiler is too old - please upgrade it. #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index c3bf7710f69a..01dd58c74d80 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -252,32 +252,8 @@ struct ftrace_likely_data { * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving * non-scalar types unchanged. */ -#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 40900) || defined(__CHECKER__) /* - * We build this out of a couple of helper macros in a vain attempt to - * help you keep your lunch down while reading it. - */ -#define __pick_scalar_type(x, type, otherwise) \ - __builtin_choose_expr(__same_type(x, type), (type)0, otherwise) - -/* - * 'char' is not type-compatible with either 'signed char' or 'unsigned char', - * so we include the naked type here as well as the signed/unsigned variants. - */ -#define __pick_integer_type(x, type, otherwise) \ - __pick_scalar_type(x, type, \ - __pick_scalar_type(x, unsigned type, \ - __pick_scalar_type(x, signed type, otherwise))) - -#define __unqual_scalar_typeof(x) typeof( \ - __pick_integer_type(x, char, \ - __pick_integer_type(x, short, \ - __pick_integer_type(x, int, \ - __pick_integer_type(x, long, \ - __pick_integer_type(x, long long, x)))))) -#else -/* - * If supported, prefer C11 _Generic for better compile-times. As above, 'char' + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' * is not type-compatible with 'signed char', and we define a separate case. */ #define __scalar_type_to_expr_cases(type) \ @@ -293,7 +269,6 @@ struct ftrace_likely_data { __scalar_type_to_expr_cases(long), \ __scalar_type_to_expr_cases(long long), \ default: (x))) -#endif /* Is this type a native word size -- useful for atomic operations */ #define __native_word(t) \ diff --git a/mm/migrate.c b/mm/migrate.c index f37729673558..40cd7016ae6f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1160,22 +1160,11 @@ out: return rc; } -/* - * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work - * around it. - */ -#if defined(CONFIG_ARM) && \ - defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 -#define ICE_noinline noinline -#else -#define ICE_noinline -#endif - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static ICE_noinline int unmap_and_move(new_page_t get_new_page, +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 4671fbf28842..7f475d59a097 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -18,8 +18,7 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#if !defined(__ASSEMBLY__) && \ - (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#if !defined(__ASSEMBLY__) #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ -- cgit v1.2.3 From 160251842cd35a75edfb0a1d76afa3eb674ff40a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 11:49:23 -0700 Subject: kallsyms: Refactor kallsyms_show_value() to take cred In order to perform future tests against the cred saved during open(), switch kallsyms_show_value() to operate on a cred, and have all current callers pass current_cred(). This makes it very obvious where callers are checking the wrong credential in their "read" contexts. These will be fixed in the coming patches. Additionally switch return value to bool, since it is always used as a direct permission check, not a 0-on-success, negative-on-error style function return. Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/filter.h | 2 +- include/linux/kallsyms.h | 5 +++-- kernel/kallsyms.c | 17 +++++++++++------ kernel/kprobes.c | 4 ++-- kernel/module.c | 2 +- 5 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 259377723603..55104f6c78e8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -889,7 +889,7 @@ static inline bool bpf_dump_raw_ok(void) /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ - return kallsyms_show_value() == 1; + return kallsyms_show_value(current_cred()); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 98338dc6b5d2..481273f0c72d 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -18,6 +18,7 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +struct cred; struct module; static inline int is_kernel_inittext(unsigned long addr) @@ -98,7 +99,7 @@ int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* How and when do we show kallsyms values? */ -extern int kallsyms_show_value(void); +extern bool kallsyms_show_value(const struct cred *cred); #else /* !CONFIG_KALLSYMS */ @@ -158,7 +159,7 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u return -ERANGE; } -static inline int kallsyms_show_value(void) +static inline bool kallsyms_show_value(const struct cred *cred) { return false; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..bb14e64f62a4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -644,19 +644,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -673,7 +674,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..d4de217e4a91 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2448,7 +2448,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) addr = NULL; if (sym) @@ -2540,7 +2540,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..a5022ae84e50 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4377,7 +4377,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(current_cred()) ? NULL : (void *)8ul; } return err; -- cgit v1.2.3 From 63960260457a02af2a6cb35d75e6bdb17299c882 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 15:45:23 -0700 Subject: bpf: Check correct cred for CAP_SYSLOG in bpf_dump_raw_ok() When evaluating access control over kallsyms visibility, credentials at open() time need to be used, not the "current" creds (though in BPF's case, this has likely always been the same). Plumb access to associated file->f_cred down through bpf_dump_raw_ok() and its callers now that kallsysm_show_value() has been refactored to take struct cred. Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Signed-off-by: Kees Cook --- include/linux/filter.h | 4 ++-- kernel/bpf/syscall.c | 37 +++++++++++++++++++++---------------- net/core/sysctl_net_core.c | 2 +- 3 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 55104f6c78e8..0b0144752d78 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -884,12 +884,12 @@ void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_helper_changes_pkt_data(void *func); -static inline bool bpf_dump_raw_ok(void) +static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ - return kallsyms_show_value(current_cred()); + return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..859053ddf05b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3139,7 +3139,8 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; @@ -3165,7 +3166,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } @@ -3221,7 +3222,8 @@ static int set_info_rec_size(struct bpf_prog_info *info) return 0; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3290,11 +3292,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -3328,7 +3330,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -3363,7 +3365,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; @@ -3394,7 +3396,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -3451,7 +3453,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; @@ -3497,7 +3499,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3540,7 +3543,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3555,7 +3559,8 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } -static int bpf_link_get_info_by_fd(struct bpf_link *link, +static int bpf_link_get_info_by_fd(struct file *file, + struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3608,15 +3613,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_link_fops) - err = bpf_link_get_info_by_fd(f.file->private_data, + err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else err = -EINVAL; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f93f8ace6c56..6ada114bbcca 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -274,7 +274,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (jit_enable < 2 || - (jit_enable == 2 && bpf_dump_raw_ok())) { + (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); -- cgit v1.2.3 From a50ca29523b18baea548bdf5df9b4b923c2bb4f6 Mon Sep 17 00:00:00 2001 From: Dave Wang Date: Wed, 8 Jul 2020 22:25:03 -0700 Subject: Input: elan_i2c - add more hardware ID for Lenovo laptops This adds more hardware IDs for Elan touchpads found in various Lenovo laptops. Signed-off-by: Dave Wang Link: https://lore.kernel.org/r/000201d5a8bd$9fead3f0$dfc07bd0$@emc.com.tw Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- include/linux/input/elan-i2c-ids.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/input/elan-i2c-ids.h b/include/linux/input/elan-i2c-ids.h index 1ecb6b45812c..520858d12680 100644 --- a/include/linux/input/elan-i2c-ids.h +++ b/include/linux/input/elan-i2c-ids.h @@ -67,8 +67,15 @@ static const struct acpi_device_id elan_acpi_id[] = { { "ELAN062B", 0 }, { "ELAN062C", 0 }, { "ELAN062D", 0 }, + { "ELAN062E", 0 }, /* Lenovo V340 Whiskey Lake U */ + { "ELAN062F", 0 }, /* Lenovo V340 Comet Lake U */ { "ELAN0631", 0 }, { "ELAN0632", 0 }, + { "ELAN0633", 0 }, /* Lenovo S145 */ + { "ELAN0634", 0 }, /* Lenovo V340 Ice lake */ + { "ELAN0635", 0 }, /* Lenovo V1415-IIL */ + { "ELAN0636", 0 }, /* Lenovo V1415-Dali */ + { "ELAN0637", 0 }, /* Lenovo V1415-IGLR */ { "ELAN1000", 0 }, { } }; -- cgit v1.2.3 From f88814cc2578c121e6edef686365036db72af0ed Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 8 Jul 2020 13:01:57 +0300 Subject: efi/efivars: Expose RT service availability via efivars abstraction Commit bf67fad19e493b ("efi: Use more granular check for availability for variable services") introduced a check into the efivarfs, efi-pstore and other drivers that aborts loading of the module if not all three variable runtime services (GetVariable, SetVariable and GetNextVariable) are supported. However, this results in efivarfs being unavailable entirely if only SetVariable support is missing, which is only needed if you want to make any modifications. Also, efi-pstore and the sysfs EFI variable interface could be backed by another implementation of the 'efivars' abstraction, in which case it is completely irrelevant which services are supported by the EFI firmware. So make the generic 'efivars' abstraction dependent on the availibility of the GetVariable and GetNextVariable EFI runtime services, and add a helper 'efivar_supports_writes()' to find out whether the currently active efivars abstraction supports writes (and wire it up to the availability of SetVariable for the generic one). Then, use the efivar_supports_writes() helper to decide whether to permit efivarfs to be mounted read-write, and whether to enable efi-pstore or the sysfs EFI variable interface altogether. Fixes: bf67fad19e493b ("efi: Use more granular check for availability for variable services") Reported-by: Heinrich Schuchardt Acked-by: Ilias Apalodimas Tested-by: Ilias Apalodimas Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi-pstore.c | 5 +---- drivers/firmware/efi/efi.c | 12 ++++++++---- drivers/firmware/efi/efivars.c | 5 +---- drivers/firmware/efi/vars.c | 6 ++++++ fs/efivarfs/super.c | 6 +++--- include/linux/efi.h | 1 + 6 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index c2f1d4e6630b..feb7fe6f2da7 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -356,10 +356,7 @@ static struct pstore_info efi_pstore_info = { static __init int efivars_pstore_init(void) { - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) - return 0; - - if (!efivars_kobject()) + if (!efivars_kobject() || !efivar_supports_writes()) return 0; if (efivars_pstore_disable) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 5114cae4ec97..fdd1db025dbf 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -176,11 +176,13 @@ static struct efivar_operations generic_ops; static int generic_ops_register(void) { generic_ops.get_variable = efi.get_variable; - generic_ops.set_variable = efi.set_variable; - generic_ops.set_variable_nonblocking = efi.set_variable_nonblocking; generic_ops.get_next_variable = efi.get_next_variable; generic_ops.query_variable_store = efi_query_variable_store; + if (efi_rt_services_supported(EFI_RT_SUPPORTED_SET_VARIABLE)) { + generic_ops.set_variable = efi.set_variable; + generic_ops.set_variable_nonblocking = efi.set_variable_nonblocking; + } return efivars_register(&generic_efivars, &generic_ops, efi_kobj); } @@ -382,7 +384,8 @@ static int __init efisubsys_init(void) return -ENOMEM; } - if (efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) { + if (efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE | + EFI_RT_SUPPORTED_GET_NEXT_VARIABLE_NAME)) { efivar_ssdt_load(); error = generic_ops_register(); if (error) @@ -416,7 +419,8 @@ static int __init efisubsys_init(void) err_remove_group: sysfs_remove_group(efi_kobj, &efi_subsys_attr_group); err_unregister: - if (efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) + if (efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE | + EFI_RT_SUPPORTED_GET_NEXT_VARIABLE_NAME)) generic_ops_unregister(); err_put: kobject_put(efi_kobj); diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 26528a46d99e..dcea137142b3 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -680,11 +680,8 @@ int efivars_sysfs_init(void) struct kobject *parent_kobj = efivars_kobject(); int error = 0; - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) - return -ENODEV; - /* No efivars has been registered yet */ - if (!parent_kobj) + if (!parent_kobj || !efivar_supports_writes()) return 0; printk(KERN_INFO "EFI Variables Facility v%s %s\n", EFIVARS_VERSION, diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 5f2a4d162795..973eef234b36 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -1229,3 +1229,9 @@ out: return rv; } EXPORT_SYMBOL_GPL(efivars_unregister); + +int efivar_supports_writes(void) +{ + return __efivars && __efivars->ops->set_variable; +} +EXPORT_SYMBOL_GPL(efivar_supports_writes); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 12c66f5d92dd..28bb5689333a 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -201,6 +201,9 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_d_op = &efivarfs_d_ops; sb->s_time_gran = 1; + if (!efivar_supports_writes()) + sb->s_flags |= SB_RDONLY; + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true); if (!inode) return -ENOMEM; @@ -252,9 +255,6 @@ static struct file_system_type efivarfs_type = { static __init int efivarfs_init(void) { - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) - return -ENODEV; - if (!efivars_kobject()) return -ENODEV; diff --git a/include/linux/efi.h b/include/linux/efi.h index bb35f3305e55..05c47f857383 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -994,6 +994,7 @@ int efivars_register(struct efivars *efivars, int efivars_unregister(struct efivars *efivars); struct kobject *efivars_kobject(void); +int efivar_supports_writes(void); int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), void *data, bool duplicates, struct list_head *head); -- cgit v1.2.3 From 14b032b8f8fce03a546dcf365454bec8c4a58d7d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 9 Jul 2020 16:28:44 -0700 Subject: cgroup: Fix sock_cgroup_data on big-endian. In order for no_refcnt and is_data to be the lowest order two bits in the 'val' we have to pad out the bitfield of the u8. Fixes: ad0f75e5f57c ("cgroup: fix cgroup_sk_alloc() for sk_clone_lock()") Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4f1cd0edc57d..fee0b5547cd0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -792,6 +792,7 @@ struct sock_cgroup_data { struct { u8 is_data : 1; u8 no_refcnt : 1; + u8 unused : 6; u8 padding; u16 prioidx; u32 classid; @@ -801,6 +802,7 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; + u8 unused : 6; u8 no_refcnt : 1; u8 is_data : 1; } __packed; -- cgit v1.2.3 From 88b3d5c90e9685be54dd5bc441970044020eca76 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 22 Jun 2020 09:03:31 +0300 Subject: net/mlx5e: Fix port buffers cell size value Device unit for port buffers size, xoff_threshold and xon_threshold is cells. Fix a bug in driver where cell unit size was hard-coded to 128 bytes. This hard-coded value is buggy, as it is wrong for some hardware versions. Driver to read cell size from SBCAM register and translate bytes to cell units accordingly. In order to fix the bug, this patch exposes SBCAM (Shared buffer capabilities mask) layout and defines. If SBCAM.cap_cell_size is valid, use it for all bytes to cells calculations. If not valid, fallback to 128. Cell size do not change on the fly per device. Instead of issuing SBCAM access reg command every time such translation is needed, cache it in mlx5e_dcbx as part of mlx5e_dcbnl_initialize(). Pass dcbx.port_buff_cell_sz as a param to every function that needs bytes to cells translation. While fixing the bug, move MLX5E_BUFFER_CELL_SHIFT macro to en_dcbnl.c, as it is only used by that file. Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") Signed-off-by: Eran Ben Elisha Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h | 1 + .../ethernet/mellanox/mlx5/core/en/port_buffer.c | 53 ++++++++++++---------- .../ethernet/mellanox/mlx5/core/en/port_buffer.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 19 ++++++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 28 ++++++++++++ 6 files changed, 78 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h index 7be6b2d36b60..9976de8b9047 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -29,6 +29,7 @@ struct mlx5e_dcbx { bool manual_buffer; u32 cable_len; u32 xoff; + u16 port_buff_cell_sz; }; #define MLX5E_MAX_DSCP (64) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index ae99fac08b53..673f1c82d381 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -34,6 +34,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, struct mlx5e_port_buffer *port_buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); u32 total_used = 0; @@ -57,11 +58,11 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, port_buffer->buffer[i].epsb = MLX5_GET(bufferx_reg, buffer, epsb); port_buffer->buffer[i].size = - MLX5_GET(bufferx_reg, buffer, size) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, size) * port_buff_cell_sz; port_buffer->buffer[i].xon = - MLX5_GET(bufferx_reg, buffer, xon_threshold) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, xon_threshold) * port_buff_cell_sz; port_buffer->buffer[i].xoff = - MLX5_GET(bufferx_reg, buffer, xoff_threshold) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, xoff_threshold) * port_buff_cell_sz; total_used += port_buffer->buffer[i].size; mlx5e_dbg(HW, priv, "buffer %d: size=%d, xon=%d, xoff=%d, epsb=%d, lossy=%d\n", i, @@ -73,7 +74,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, } port_buffer->port_buffer_size = - MLX5_GET(pbmc_reg, out, port_buffer_size) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(pbmc_reg, out, port_buffer_size) * port_buff_cell_sz; port_buffer->spare_buffer_size = port_buffer->port_buffer_size - total_used; @@ -88,9 +89,9 @@ out: static int port_set_buffer(struct mlx5e_priv *priv, struct mlx5e_port_buffer *port_buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); - void *buffer; void *in; int err; int i; @@ -104,16 +105,18 @@ static int port_set_buffer(struct mlx5e_priv *priv, goto out; for (i = 0; i < MLX5E_MAX_BUFFER; i++) { - buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); - - MLX5_SET(bufferx_reg, buffer, size, - port_buffer->buffer[i].size >> MLX5E_BUFFER_CELL_SHIFT); - MLX5_SET(bufferx_reg, buffer, lossy, - port_buffer->buffer[i].lossy); - MLX5_SET(bufferx_reg, buffer, xoff_threshold, - port_buffer->buffer[i].xoff >> MLX5E_BUFFER_CELL_SHIFT); - MLX5_SET(bufferx_reg, buffer, xon_threshold, - port_buffer->buffer[i].xon >> MLX5E_BUFFER_CELL_SHIFT); + void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + u64 size = port_buffer->buffer[i].size; + u64 xoff = port_buffer->buffer[i].xoff; + u64 xon = port_buffer->buffer[i].xon; + + do_div(size, port_buff_cell_sz); + do_div(xoff, port_buff_cell_sz); + do_div(xon, port_buff_cell_sz); + MLX5_SET(bufferx_reg, buffer, size, size); + MLX5_SET(bufferx_reg, buffer, lossy, port_buffer->buffer[i].lossy); + MLX5_SET(bufferx_reg, buffer, xoff_threshold, xoff); + MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); } err = mlx5e_port_set_pbmc(mdev, in); @@ -143,7 +146,7 @@ static u32 calculate_xoff(struct mlx5e_priv *priv, unsigned int mtu) } static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, - u32 xoff, unsigned int max_mtu) + u32 xoff, unsigned int max_mtu, u16 port_buff_cell_sz) { int i; @@ -155,7 +158,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, } if (port_buffer->buffer[i].size < - (xoff + max_mtu + (1 << MLX5E_BUFFER_CELL_SHIFT))) { + (xoff + max_mtu + port_buff_cell_sz)) { pr_err("buffer_size[%d]=%d is not enough for lossless buffer\n", i, port_buffer->buffer[i].size); return -ENOMEM; @@ -175,6 +178,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, * @pfc_en: current pfc configuration * @buffer: current prio to buffer mapping * @xoff: xoff value + * @port_buff_cell_sz: port buffer cell_size * @port_buffer: port receive buffer configuration * @change: * @@ -189,7 +193,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, * sets change to true if buffer configuration was modified. */ static int update_buffer_lossy(unsigned int max_mtu, - u8 pfc_en, u8 *buffer, u32 xoff, + u8 pfc_en, u8 *buffer, u32 xoff, u16 port_buff_cell_sz, struct mlx5e_port_buffer *port_buffer, bool *change) { @@ -225,7 +229,7 @@ static int update_buffer_lossy(unsigned int max_mtu, } if (changed) { - err = update_xoff_threshold(port_buffer, xoff, max_mtu); + err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; @@ -262,6 +266,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, u32 *buffer_size, u8 *prio2buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5e_port_buffer port_buffer; u32 xoff = calculate_xoff(priv, mtu); bool update_prio2buffer = false; @@ -282,7 +287,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (change & MLX5E_PORT_BUFFER_CABLE_LEN) { update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } @@ -292,7 +297,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (err) return err; - err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff, + err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff, port_buff_cell_sz, &port_buffer, &update_buffer); if (err) return err; @@ -304,7 +309,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (err) return err; - err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer, + err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer, port_buff_cell_sz, xoff, &port_buffer, &update_buffer); if (err) return err; @@ -329,7 +334,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, return -EINVAL; update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } @@ -337,7 +342,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, /* Need to update buffer configuration if xoff value is changed */ if (!update_buffer && xoff != priv->dcbx.xoff) { update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h index 34f55b81a0de..80af7a5ac604 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h @@ -36,7 +36,6 @@ #include "port.h" #define MLX5E_MAX_BUFFER 8 -#define MLX5E_BUFFER_CELL_SHIFT 7 #define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */ #define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index bc102d094bbd..d20243d6a032 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -1217,6 +1217,24 @@ static int mlx5e_trust_initialize(struct mlx5e_priv *priv) return 0; } +#define MLX5E_BUFFER_CELL_SHIFT 7 + +static u16 mlx5e_query_port_buffers_cell_size(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + if (mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), + MLX5_REG_SBCAM, 0, 0)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + return MLX5_GET(sbcam_reg, out, cap_cell_size); +} + void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) { struct mlx5e_dcbx *dcbx = &priv->dcbx; @@ -1234,6 +1252,7 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) priv->dcbx.cap |= DCB_CAP_DCBX_HOST; + priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); priv->dcbx.manual_buffer = false; priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 13c0e4556eda..1e6ca716635a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -147,6 +147,7 @@ enum { MLX5_REG_MCDA = 0x9063, MLX5_REG_MCAM = 0x907f, MLX5_REG_MIRC = 0x9162, + MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ca1887dd0423..073b79eacc99 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9960,6 +9960,34 @@ struct mlx5_ifc_pptb_reg_bits { u8 untagged_buff[0x4]; }; +struct mlx5_ifc_sbcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + + u8 reserved_at_20[0x20]; + + u8 sb_access_reg_cap_mask[4][0x20]; + + u8 reserved_at_c0[0x80]; + + u8 sb_feature_cap_mask[4][0x20]; + + u8 reserved_at_1c0[0x40]; + + u8 cap_total_buffer_size[0x20]; + + u8 cap_cell_size[0x10]; + u8 cap_max_pg_buffers[0x8]; + u8 cap_num_pool_supported[0x8]; + + u8 reserved_at_240[0x8]; + u8 cap_sbsr_stat_size[0x8]; + u8 cap_max_tclass_data[0x8]; + u8 cap_max_cpu_ingress_tclass_sb[0x8]; +}; + struct mlx5_ifc_pbmc_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.2.3 From 8c080d3a974ad471d8324825851044284f1886c9 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jun 2020 13:36:42 +0800 Subject: kgdb: enable arch to support XML packet. The XML packet could be supported by required architecture if the architecture defines CONFIG_HAVE_ARCH_KGDB_QXFER_PKT and implement its own kgdb_arch_handle_qxfer_pkt(). Except for the kgdb_arch_handle_qxfer_pkt(), the architecture also needs to record the feature supported by gdb stub into the kgdb_arch_gdb_stub_feature, and these features will be reported to host gdb when gdb stub receives the qSupported packet. Signed-off-by: Vincent Chen Acked-by: Daniel Thompson Signed-off-by: Palmer Dabbelt --- include/linux/kgdb.h | 11 +++++++++++ kernel/debug/gdbstub.c | 13 +++++++++++++ lib/Kconfig.kgdb | 5 +++++ 3 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 529116b0cabe..0e4e3a80d58c 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -176,6 +176,17 @@ kgdb_arch_handle_exception(int vector, int signo, int err_code, char *remcom_out_buffer, struct pt_regs *regs); +/** + * kgdb_arch_handle_qxfer_pkt - Handle architecture specific GDB XML + * packets. + * @remcom_in_buffer: The buffer of the packet we have read. + * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. + */ + +extern void +kgdb_arch_handle_qxfer_pkt(char *remcom_in_buffer, + char *remcom_out_buffer); + /** * kgdb_call_nmi_hook - Call kgdb_nmicallback() on the current CPU * @ignored: This parameter is only here to match the prototype. diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 61774aec46b4..a790026e42d0 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -792,6 +792,19 @@ static void gdb_cmd_query(struct kgdb_state *ks) } break; #endif +#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT + case 'S': + if (!strncmp(remcom_in_buffer, "qSupported:", 11)) + strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + break; + case 'X': + if (!strncmp(remcom_in_buffer, "qXfer:", 6)) + kgdb_arch_handle_qxfer_pkt(remcom_in_buffer, + remcom_out_buffer); + break; +#endif + default: + break; } } diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index ffa7a76de086..256f2486f9bd 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -3,6 +3,11 @@ config HAVE_ARCH_KGDB bool +# set if architecture has the its kgdb_arch_handle_qxfer_pkt +# function to enable gdb stub to address XML packet sent from GDB. +config HAVE_ARCH_KGDB_QXFER_PKT + bool + menuconfig KGDB bool "KGDB: kernel debugger" depends on HAVE_ARCH_KGDB -- cgit v1.2.3 From def0aa218e6d42231540329e6f5741fdec9e7da4 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jun 2020 13:37:25 +0800 Subject: kgdb: Move the extern declaration kgdb_has_hit_break() to generic kgdb.h Currently, only riscv kgdb.c uses the kgdb_has_hit_break() to identify the kgdb breakpoint. It causes other architectures will encounter the "no previous prototype" warnings if the compile option has W=1. Moving the declaration of extern kgdb_has_hit_break() from risc-v kgdb.h to generic kgdb.h to avoid generating these warnings. Signed-off-by: Vincent Chen Acked-by: Daniel Thompson Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kgdb.h | 1 - include/linux/kgdb.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/riscv/include/asm/kgdb.h b/arch/riscv/include/asm/kgdb.h index 8177a457caff..f45889bbb965 100644 --- a/arch/riscv/include/asm/kgdb.h +++ b/arch/riscv/include/asm/kgdb.h @@ -19,7 +19,6 @@ #ifndef __ASSEMBLY__ -extern int kgdb_has_hit_break(unsigned long addr); extern unsigned long kgdb_compiled_break; static inline void arch_kgdb_breakpoint(void) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 0e4e3a80d58c..477b8b7c908f 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -325,6 +325,7 @@ extern int kgdb_hex2mem(char *buf, char *mem, int count); extern int kgdb_isremovedbreak(unsigned long addr); extern void kgdb_schedule_breakpoint(void); +extern int kgdb_has_hit_break(unsigned long addr); extern int kgdb_handle_exception(int ex_vector, int signo, int err_code, -- cgit v1.2.3 From 6348dd291e3653534a9e28e6917569bc9967b35b Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 19 Jun 2020 17:27:19 +0530 Subject: dmabuf: use spinlock to access dmabuf->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There exists a sleep-while-atomic bug while accessing the dmabuf->name under mutex in the dmabuffs_dname(). This is caused from the SELinux permissions checks on a process where it tries to validate the inherited files from fork() by traversing them through iterate_fd() (which traverse files under spin_lock) and call match_file(security/selinux/hooks.c) where the permission checks happen. This audit information is logged using dump_common_audit_data() where it calls d_path() to get the file path name. If the file check happen on the dmabuf's fd, then it ends up in ->dmabuffs_dname() and use mutex to access dmabuf->name. The flow will be like below: flush_unauthorized_files() iterate_fd() spin_lock() --> Start of the atomic section. match_file() file_has_perm() avc_has_perm() avc_audit() slow_avc_audit() common_lsm_audit() dump_common_audit_data() audit_log_d_path() d_path() dmabuffs_dname() mutex_lock()--> Sleep while atomic. Call trace captured (on 4.19 kernels) is below: ___might_sleep+0x204/0x208 __might_sleep+0x50/0x88 __mutex_lock_common+0x5c/0x1068 __mutex_lock_common+0x5c/0x1068 mutex_lock_nested+0x40/0x50 dmabuffs_dname+0xa0/0x170 d_path+0x84/0x290 audit_log_d_path+0x74/0x130 common_lsm_audit+0x334/0x6e8 slow_avc_audit+0xb8/0xf8 avc_has_perm+0x154/0x218 file_has_perm+0x70/0x180 match_file+0x60/0x78 iterate_fd+0x128/0x168 selinux_bprm_committing_creds+0x178/0x248 security_bprm_committing_creds+0x30/0x48 install_exec_creds+0x1c/0x68 load_elf_binary+0x3a4/0x14e0 search_binary_handler+0xb0/0x1e0 So, use spinlock to access dmabuf->name to avoid sleep-while-atomic. Cc: [5.3+] Signed-off-by: Charan Teja Kalla Reviewed-by: Michael J. Ruhl Acked-by: Christian König [sumits: added comment to spinlock_t definition to avoid warning] Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/a83e7f0d-4e54-9848-4b58-e1acdbe06735@codeaurora.org --- drivers/dma-buf/dma-buf.c | 11 +++++++---- include/linux/dma-buf.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 412629601ad3..1ca609f66fdf 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -45,10 +45,10 @@ static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) size_t ret = 0; dmabuf = dentry->d_fsdata; - dma_resv_lock(dmabuf->resv, NULL); + spin_lock(&dmabuf->name_lock); if (dmabuf->name) ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN); - dma_resv_unlock(dmabuf->resv); + spin_unlock(&dmabuf->name_lock); return dynamic_dname(dentry, buffer, buflen, "/%s:%s", dentry->d_name.name, ret > 0 ? name : ""); @@ -338,8 +338,10 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) kfree(name); goto out_unlock; } + spin_lock(&dmabuf->name_lock); kfree(dmabuf->name); dmabuf->name = name; + spin_unlock(&dmabuf->name_lock); out_unlock: dma_resv_unlock(dmabuf->resv); @@ -402,10 +404,10 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) /* Don't count the temporary reference taken inside procfs seq_show */ seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1); seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name); - dma_resv_lock(dmabuf->resv, NULL); + spin_lock(&dmabuf->name_lock); if (dmabuf->name) seq_printf(m, "name:\t%s\n", dmabuf->name); - dma_resv_unlock(dmabuf->resv); + spin_unlock(&dmabuf->name_lock); } static const struct file_operations dma_buf_fops = { @@ -542,6 +544,7 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) dmabuf->size = exp_info->size; dmabuf->exp_name = exp_info->exp_name; dmabuf->owner = exp_info->owner; + spin_lock_init(&dmabuf->name_lock); init_waitqueue_head(&dmabuf->poll); dmabuf->cb_excl.poll = dmabuf->cb_shared.poll = &dmabuf->poll; dmabuf->cb_excl.active = dmabuf->cb_shared.active = 0; diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index ab0c156abee6..a2ca294eaebe 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -311,6 +311,7 @@ struct dma_buf { void *vmap_ptr; const char *exp_name; const char *name; + spinlock_t name_lock; /* spinlock to protect name access */ struct module *owner; struct list_head list_node; void *priv; -- cgit v1.2.3 From ec7bd78498f29680f536451fbdf9464e851273ed Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 1 Jul 2020 12:42:58 -0700 Subject: driver core: Rename dev_links_info.defer_sync to defer_hook The defer_sync field is used as a hook to add the device to the deferred_sync list. Rename it so that it's more meaningful for the next patch that'll also use this field as a hook to a deferred_fw_devlink list. Signed-off-by: Saravana Kannan Reviewed-by: Rafael J. Wysocki Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20200701194259.3337652-3-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 22 +++++++++++----------- include/linux/device.h | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 35cc9896eb9e..d1d2cdc3a8d8 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -754,11 +754,11 @@ static void __device_links_queue_sync_state(struct device *dev, */ dev->state_synced = true; - if (WARN_ON(!list_empty(&dev->links.defer_sync))) + if (WARN_ON(!list_empty(&dev->links.defer_hook))) return; get_device(dev); - list_add_tail(&dev->links.defer_sync, list); + list_add_tail(&dev->links.defer_hook, list); } /** @@ -776,8 +776,8 @@ static void device_links_flush_sync_list(struct list_head *list, { struct device *dev, *tmp; - list_for_each_entry_safe(dev, tmp, list, links.defer_sync) { - list_del_init(&dev->links.defer_sync); + list_for_each_entry_safe(dev, tmp, list, links.defer_hook) { + list_del_init(&dev->links.defer_hook); if (dev != dont_lock_dev) device_lock(dev); @@ -815,12 +815,12 @@ void device_links_supplier_sync_state_resume(void) if (defer_sync_state_count) goto out; - list_for_each_entry_safe(dev, tmp, &deferred_sync, links.defer_sync) { + list_for_each_entry_safe(dev, tmp, &deferred_sync, links.defer_hook) { /* * Delete from deferred_sync list before queuing it to - * sync_list because defer_sync is used for both lists. + * sync_list because defer_hook is used for both lists. */ - list_del_init(&dev->links.defer_sync); + list_del_init(&dev->links.defer_hook); __device_links_queue_sync_state(dev, &sync_list); } out: @@ -838,8 +838,8 @@ late_initcall(sync_state_resume_initcall); static void __device_links_supplier_defer_sync(struct device *sup) { - if (list_empty(&sup->links.defer_sync) && dev_has_sync_state(sup)) - list_add_tail(&sup->links.defer_sync, &deferred_sync); + if (list_empty(&sup->links.defer_hook) && dev_has_sync_state(sup)) + list_add_tail(&sup->links.defer_hook, &deferred_sync); } static void device_link_drop_managed(struct device_link *link) @@ -1052,7 +1052,7 @@ void device_links_driver_cleanup(struct device *dev) WRITE_ONCE(link->status, DL_STATE_DORMANT); } - list_del_init(&dev->links.defer_sync); + list_del_init(&dev->links.defer_hook); __device_links_no_driver(dev); device_links_write_unlock(); @@ -2171,7 +2171,7 @@ void device_initialize(struct device *dev) INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); INIT_LIST_HEAD(&dev->links.needs_suppliers); - INIT_LIST_HEAD(&dev->links.defer_sync); + INIT_LIST_HEAD(&dev->links.defer_hook); dev->links.status = DL_DEV_NO_DRIVER; } EXPORT_SYMBOL_GPL(device_initialize); diff --git a/include/linux/device.h b/include/linux/device.h index 15460a5ac024..9bb2bc7bb8e3 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -433,7 +433,7 @@ enum dl_dev_state { * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @needs_suppliers: Hook to global list of devices waiting for suppliers. - * @defer_sync: Hook to global list of devices that have deferred sync_state. + * @defer_hook: Hook to global list of devices that have deferred sync_state. * @need_for_probe: If needs_suppliers is on a list, this indicates if the * suppliers are needed for probe or not. * @status: Driver status information. @@ -442,7 +442,7 @@ struct dev_links_info { struct list_head suppliers; struct list_head consumers; struct list_head needs_suppliers; - struct list_head defer_sync; + struct list_head defer_hook; bool need_for_probe; enum dl_dev_state status; }; -- cgit v1.2.3 From 2451e746478a6a6e981cfa66b62b791ca93b90c8 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 1 Jul 2020 12:42:59 -0700 Subject: driver core: Avoid deferred probe due to fw_devlink_pause/resume() With the earlier patch in this series, all devices that deferred probe due to fw_devlink_pause() will have their probes delayed till the deferred probe thread is kicked off during late_initcall. This will also affect all their consumers. This delayed probing in unnecessary. So this patch just keeps track of the devices that had their probe deferred due to fw_devlink_pause() and attempts to probe them once during fw_devlink_resume(). Fixes: 716a7a259690 ("driver core: fw_devlink: Add support for batching fwnode parsing") Signed-off-by: Saravana Kannan Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20200701194259.3337652-4-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 21 +++++++++++++++++++++ include/linux/device.h | 3 ++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index d1d2cdc3a8d8..05d414e9e8a4 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -50,6 +50,7 @@ static DEFINE_MUTEX(wfs_lock); static LIST_HEAD(deferred_sync); static unsigned int defer_sync_state_count = 1; static unsigned int defer_fw_devlink_count; +static LIST_HEAD(deferred_fw_devlink); static DEFINE_MUTEX(defer_fw_devlink_lock); static bool fw_devlink_is_permissive(void); @@ -1244,6 +1245,12 @@ static void fw_devlink_link_device(struct device *dev) fw_ret = -EAGAIN; } else { fw_ret = -ENODEV; + /* + * defer_hook is not used to add device to deferred_sync list + * until device is bound. Since deferred fw devlink also blocks + * probing, same list hook can be used for deferred_fw_devlink. + */ + list_add_tail(&dev->links.defer_hook, &deferred_fw_devlink); } if (fw_ret == -ENODEV) @@ -1312,6 +1319,9 @@ void fw_devlink_pause(void) */ void fw_devlink_resume(void) { + struct device *dev, *tmp; + LIST_HEAD(probe_list); + mutex_lock(&defer_fw_devlink_lock); if (!defer_fw_devlink_count) { WARN(true, "Unmatched fw_devlink pause/resume!"); @@ -1323,8 +1333,19 @@ void fw_devlink_resume(void) goto out; device_link_add_missing_supplier_links(); + list_splice_tail_init(&deferred_fw_devlink, &probe_list); out: mutex_unlock(&defer_fw_devlink_lock); + + /* + * bus_probe_device() can cause new devices to get added and they'll + * try to grab defer_fw_devlink_lock. So, this needs to be done outside + * the defer_fw_devlink_lock. + */ + list_for_each_entry_safe(dev, tmp, &probe_list, links.defer_hook) { + list_del_init(&dev->links.defer_hook); + bus_probe_device(dev); + } } /* Device links support end. */ diff --git a/include/linux/device.h b/include/linux/device.h index 9bb2bc7bb8e3..5efed864b387 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -433,7 +433,8 @@ enum dl_dev_state { * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @needs_suppliers: Hook to global list of devices waiting for suppliers. - * @defer_hook: Hook to global list of devices that have deferred sync_state. + * @defer_hook: Hook to global list of devices that have deferred sync_state or + * deferred fw_devlink. * @need_for_probe: If needs_suppliers is on a list, this indicates if the * suppliers are needed for probe or not. * @status: Driver status information. -- cgit v1.2.3 From b330966f79fb4fdc49183f58db113303695a750f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Jul 2020 14:45:41 +0200 Subject: fuse: reject options on reconfigure via fsconfig(2) Previous patch changed handling of remount/reconfigure to ignore all options, including those that are unknown to the fuse kernel fs. This was done for backward compatibility, but this likely only affects the old mount(2) API. The new fsconfig(2) based reconfiguration could possibly be improved. This would make the new API less of a drop in replacement for the old, OTOH this is a good chance to get rid of some weirdnesses in the old API. Several other behaviors might make sense: 1) unknown options are rejected, known options are ignored 2) unknown options are rejected, known options are rejected if the value is changed, allowed otherwise 3) all options are rejected Prior to the backward compatibility fix to ignore all options all known options were accepted (1), even if they change the value of a mount parameter; fuse_reconfigure() does not look at the config values set by fuse_parse_param(). To fix that we'd need to verify that the value provided is the same as set in the initial configuration (2). The major drawback is that this is much more complex than just rejecting all attempts at changing options (3); i.e. all options signify initial configuration values and don't make sense on reconfigure. This patch opts for (3) with the rationale that no mount options are reconfigurable in fuse. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 16 ++++++++++------ fs/namespace.c | 1 + include/linux/fs_context.h | 1 + 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ba201bf5ffad..bba747520e9b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -477,12 +477,16 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) struct fuse_fs_context *ctx = fc->fs_private; int opt; - /* - * Ignore options coming from mount(MS_REMOUNT) for backward - * compatibility. - */ - if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) - return 0; + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + /* + * Ignore options coming from mount(MS_REMOUNT) for backward + * compatibility. + */ + if (fc->oldapi) + return 0; + + return invalfc(fc, "No changes allowed in reconfigure"); + } opt = fs_parse(fc, fuse_fs_parameters, param, &result); if (opt < 0) diff --git a/fs/namespace.c b/fs/namespace.c index f30ed401cc6d..4a0f600a3328 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2603,6 +2603,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (IS_ERR(fc)) return PTR_ERR(fc); + fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 5f24fcbfbfb4..37e1e8f7f08d 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -109,6 +109,7 @@ struct fs_context { enum fs_context_phase phase:8; /* The phase the context is in */ bool need_free:1; /* Need to call ops->free() */ bool global:1; /* Goes into &init_user_ns */ + bool oldapi:1; /* Coming from mount(2) */ }; struct fs_context_operations { -- cgit v1.2.3 From 567f6a6eba0c09e5f502e0290e57651befa8aacb Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:25 +0200 Subject: dma-direct: provide function to check physical memory area validity dma_coherent_ok() checks if a physical memory area fits a device's DMA constraints. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 1 + kernel/dma/direct.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 5184735a0fe8..ab2e20cba951 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -69,6 +69,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, u64 dma_direct_get_required_mask(struct device *dev); gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_mask); +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 95866b647581..67f060b86a73 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -70,7 +70,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); -- cgit v1.2.3 From 7f5f81406e2b36785f1e25fe5209edd9dd3610d7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:37:25 -0700 Subject: rhashtable: drop duplicated word in Drop the doubled word "be" in a comment. Signed-off-by: Randy Dunlap Cc: Thomas Graf Cc: Herbert Xu Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 70ebef866cc8..d3432ee65de7 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -33,7 +33,7 @@ * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has * the least significant bit set but otherwise stores the address of - * the hash bucket. This allows us to be be sure we've found the end + * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to -- cgit v1.2.3 From aca7ed091117d9b4ce499855c383119afb2819a2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:38:15 -0700 Subject: i2c: drop duplicated word in the header file Drop the doubled word "be" in a comment. Signed-off-by: Randy Dunlap Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index b8b8963f8bb9..ee328cf80bd9 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -56,7 +56,7 @@ struct property_entry; * on a bus (or read from them). Apart from two basic transfer functions to * transmit one message at a time, a more complex version can be used to * transmit an arbitrary number of messages without interruption. - * @count must be be less than 64k since msg.len is u16. + * @count must be less than 64k since msg.len is u16. */ int i2c_transfer_buffer_flags(const struct i2c_client *client, char *buf, int count, u16 flags); -- cgit v1.2.3 From 5df96f2b9f58a5d2dc1f30fe7de75e197f2c25f2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 23 Jul 2020 10:42:09 -0400 Subject: dm integrity: fix integrity recalculation that is improperly skipped Commit adc0daad366b62ca1bce3e2958a40b0b71a8b8b3 ("dm: report suspended device during destroy") broke integrity recalculation. The problem is dm_suspended() returns true not only during suspend, but also during resume. So this race condition could occur: 1. dm_integrity_resume calls queue_work(ic->recalc_wq, &ic->recalc_work) 2. integrity_recalc (&ic->recalc_work) preempts the current thread 3. integrity_recalc calls if (unlikely(dm_suspended(ic->ti))) goto unlock_ret; 4. integrity_recalc exits and no recalculating is done. To fix this race condition, add a function dm_post_suspending that is only true during the postsuspend phase and use it instead of dm_suspended(). Signed-off-by: Mikulas Patocka Fixes: adc0daad366b ("dm: report suspended device during destroy") Cc: stable vger kernel org # v4.18+ Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 ++-- drivers/md/dm.c | 17 +++++++++++++++++ include/linux/device-mapper.h | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 81dc5ff08909..a83a1de1e03f 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2420,7 +2420,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (unlikely(dm_suspended(ic->ti)) && !ic->meta_dev) + if (unlikely(dm_post_suspending(ic->ti)) && !ic->meta_dev) return; spin_lock_irq(&ic->endio_wait.lock); @@ -2481,7 +2481,7 @@ static void integrity_recalc(struct work_struct *w) next_chunk: - if (unlikely(dm_suspended(ic->ti))) + if (unlikely(dm_post_suspending(ic->ti))) goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 52449afd58eb..5b9de2f71bb0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -143,6 +143,7 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_DEFERRED_REMOVE 6 #define DMF_SUSPENDED_INTERNALLY 7 +#define DMF_POST_SUSPENDING 8 #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; @@ -2408,6 +2409,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); } /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ @@ -2766,7 +2768,9 @@ retry: if (r) goto out_unlock; + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); out_unlock: mutex_unlock(&md->suspend_lock); @@ -2863,7 +2867,9 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, DMF_SUSPENDED_INTERNALLY); + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); } static void __dm_internal_resume(struct mapped_device *md) @@ -3024,6 +3030,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +static int dm_post_suspending_md(struct mapped_device *md) +{ + return test_bit(DMF_POST_SUSPENDING, &md->flags); +} + int dm_suspended_internally_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); @@ -3040,6 +3051,12 @@ int dm_suspended(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_suspended); +int dm_post_suspending(struct dm_target *ti) +{ + return dm_post_suspending_md(dm_table_get_md(ti->table)); +} +EXPORT_SYMBOL_GPL(dm_post_suspending); + int dm_noflush_suspending(struct dm_target *ti) { return __noflush_suspending(dm_table_get_md(ti->table)); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8750f2dc5613..73dec4b5d5be 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -426,6 +426,7 @@ const char *dm_device_name(struct mapped_device *md); int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); struct gendisk *dm_disk(struct mapped_device *md); int dm_suspended(struct dm_target *ti); +int dm_post_suspending(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); union map_info *dm_get_rq_mapinfo(struct request *rq); -- cgit v1.2.3 From 76be93fc0702322179bb0ea87295d820ee46ad14 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 23 Jul 2020 12:00:06 -0700 Subject: tcp: allow at most one TLP probe per flight Previously TLP may send multiple probes of new data in one flight. This happens when the sender is cwnd limited. After the initial TLP containing new data is sent, the sender receives another ACK that acks partial inflight. It may re-arm another TLP timer to send more, if no further ACK returns before the next TLP timeout (PTO) expires. The sender may send in theory a large amount of TLP until send queue is depleted. This only happens if the sender sees such irregular uncommon ACK pattern. But it is generally undesirable behavior during congestion especially. The original TLP design restrict only one TLP probe per inflight as published in "Reducing Web Latency: the Virtue of Gentle Aggression", SIGCOMM 2013. This patch changes TLP to send at most one probe per inflight. Note that if the sender is app-limited, TLP retransmits old data and did not have this issue. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 ++++-- net/ipv4/tcp_input.c | 11 ++++++----- net/ipv4/tcp_output.c | 13 ++++++++----- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9aac824c523c..a1bbaa1c1a3a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -220,7 +220,9 @@ struct tcp_sock { } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; - u8 dup_ack_counter; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ + unused:5; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ @@ -243,7 +245,7 @@ struct tcp_sock { save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ syn_smc:1; /* SYN includes SMC */ - u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ + u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9615e72656d1..518f04355fbf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3488,10 +3488,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } } -/* This routine deals with acks during a TLP episode. - * We mark the end of a TLP episode on receiving TLP dupack or when - * ack is after tlp_high_seq. - * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. +/* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { @@ -3500,7 +3498,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (before(ack, tp->tlp_high_seq)) return; - if (flag & FLAG_DSACKING_ACK) { + if (!tp->tlp_retrans) { + /* TLP of new data has been acknowledged */ + tp->tlp_high_seq = 0; + } else if (flag & FLAG_DSACKING_ACK) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5f5b2f0b0e60..0bc05d68cd74 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2624,6 +2624,11 @@ void tcp_send_loss_probe(struct sock *sk) int pcount; int mss = tcp_current_mss(sk); + /* At most one outstanding TLP */ + if (tp->tlp_high_seq) + goto rearm_timer; + + tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; @@ -2641,10 +2646,6 @@ void tcp_send_loss_probe(struct sock *sk) return; } - /* At most one outstanding TLP retransmission. */ - if (tp->tlp_high_seq) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; @@ -2666,10 +2667,12 @@ void tcp_send_loss_probe(struct sock *sk) if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; + tp->tlp_retrans = 1; + +probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; -probe_sent: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; -- cgit v1.2.3 From 3bef735ad7b7d987069181e7b58588043cbd1509 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 23 Jul 2020 21:15:14 -0700 Subject: vfs/xattr: mm/shmem: kernfs: release simple xattr entry in a right way After commit fdc85222d58e ("kernfs: kvmalloc xattr value instead of kmalloc"), simple xattr entry is allocated with kvmalloc() instead of kmalloc(), so we should release it with kvfree() instead of kfree(). Fixes: fdc85222d58e ("kernfs: kvmalloc xattr value instead of kmalloc") Signed-off-by: Chengguang Xu Signed-off-by: Andrew Morton Acked-by: Hugh Dickins Acked-by: Tejun Heo Cc: Daniel Xu Cc: Chris Down Cc: Andreas Dilger Cc: Greg Kroah-Hartman Cc: Al Viro Cc: [5.7] Link: http://lkml.kernel.org/r/20200704051608.15043-1-cgxu519@mykernel.net Signed-off-by: Linus Torvalds --- include/linux/xattr.h | 3 ++- mm/shmem.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 47eaa34f8761..c5afaf8ca7a2 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -15,6 +15,7 @@ #include #include #include +#include #include struct inode; @@ -94,7 +95,7 @@ static inline void simple_xattrs_free(struct simple_xattrs *xattrs) list_for_each_entry_safe(xattr, node, &xattrs->head, list) { kfree(xattr->name); - kfree(xattr); + kvfree(xattr); } } diff --git a/mm/shmem.c b/mm/shmem.c index a0dbe62f8042..b2abca3f7f33 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3178,7 +3178,7 @@ static int shmem_initxattrs(struct inode *inode, new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } -- cgit v1.2.3 From e0b3e0b1a04367fc15c07f44e78361545b55357c Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 23 Jul 2020 21:15:46 -0700 Subject: io-mapping: indicate mapping failure The !ATOMIC_IOMAP version of io_maping_init_wc will always return success, even when the ioremap fails. Since the ATOMIC_IOMAP version returns NULL when the init fails, and callers check for a NULL return on error this is unexpected. During a device probe, where the ioremap failed, a crash can look like this: BUG: unable to handle page fault for address: 0000000000210000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page Oops: 0002 [#1] PREEMPT SMP CPU: 0 PID: 177 Comm: RIP: 0010:fill_page_dma [i915] gen8_ppgtt_create [i915] i915_ppgtt_create [i915] intel_gt_init [i915] i915_gem_init [i915] i915_driver_probe [i915] pci_device_probe really_probe driver_probe_device The remap failure occurred much earlier in the probe. If it had been propagated, the driver would have exited with an error. Return NULL on ioremap failure. [akpm@linux-foundation.org: detect ioremap_wc() errors earlier] Fixes: cafaf14a5d8f ("io-mapping: Always create a struct to hold metadata about the io-mapping") Signed-off-by: Michael J. Ruhl Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Mike Rapoport Cc: Andy Shevchenko Cc: Chris Wilson Cc: Daniel Vetter Cc: Link: http://lkml.kernel.org/r/20200721171936.81563-1-michael.j.ruhl@intel.com Signed-off-by: Linus Torvalds --- include/linux/io-mapping.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 0beaa3eba155..c75e4d3d8833 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -107,9 +107,12 @@ io_mapping_init_wc(struct io_mapping *iomap, resource_size_t base, unsigned long size) { + iomap->iomem = ioremap_wc(base, size); + if (!iomap->iomem) + return NULL; + iomap->base = base; iomap->size = size; - iomap->iomem = ioremap_wc(base, size); #if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */ iomap->prot = pgprot_noncached_wc(PAGE_KERNEL); #elif defined(pgprot_writecombine) -- cgit v1.2.3 From 8be23aec0ee151de731626d5578973fde25b2285 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 25 Jul 2020 16:07:36 +0200 Subject: i2c: also convert placeholder function to return errno All i2c_new_device-alike functions return ERR_PTR these days, but this fallback function was missed. Fixes: 2dea645ffc21 ("i2c: acpi: Return error pointers from i2c_acpi_new_device()") Signed-off-by: Wolfram Sang Reviewed-by: Andy Shevchenko [wsa: changed from 'ENOSYS' to 'ENODEV'] Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index ee328cf80bd9..4e7714c88f95 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1001,7 +1001,7 @@ static inline u32 i2c_acpi_find_bus_speed(struct device *dev) static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info) { - return NULL; + return ERR_PTR(-ENODEV); } static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) { -- cgit v1.2.3 From 7d0314b11cdd92bca8b89684c06953bf114605fc Mon Sep 17 00:00:00 2001 From: Ron Diskin Date: Sun, 5 Apr 2020 13:58:40 +0300 Subject: net/mlx5e: Modify uplink state on interface up/down When setting the PF interface up/down, notify the firmware to update uplink state via MODIFY_VPORT_STATE, when E-Switch is enabled. This behavior will prevent sending traffic out on uplink port when PF is down, such as sending traffic from a VF interface which is still up. Currently when calling mlx5e_open/close(), the driver only sends PAOS command to notify the firmware to set the physical port state to up/down, however, it is not sufficient. When VF is in "auto" state, it follows the uplink state, which was not updated on mlx5e_open/close() before this patch. When switchdev mode is enabled and uplink representor is first enabled, set the uplink port state value back to its FW default "AUTO". Fixes: 63bfd399de55 ("net/mlx5e: Send PAOS command on interface up/down") Signed-off-by: Ron Diskin Reviewed-by: Roi Dayan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 25 ++++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 16 +++++++++------ drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 1 + 5 files changed, 37 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 31f9ecae98df..07fdbea7ea13 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3069,6 +3069,25 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE; } +static void mlx5e_modify_admin_state(struct mlx5_core_dev *mdev, + enum mlx5_port_status state) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + int vport_admin_state; + + mlx5_set_port_admin_status(mdev, state); + + if (!MLX5_ESWITCH_MANAGER(mdev) || mlx5_eswitch_mode(esw) == MLX5_ESWITCH_OFFLOADS) + return; + + if (state == MLX5_PORT_UP) + vport_admin_state = MLX5_VPORT_ADMIN_STATE_AUTO; + else + vport_admin_state = MLX5_VPORT_ADMIN_STATE_DOWN; + + mlx5_eswitch_set_vport_state(esw, MLX5_VPORT_UPLINK, vport_admin_state); +} + int mlx5e_open_locked(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -3101,7 +3120,7 @@ int mlx5e_open(struct net_device *netdev) mutex_lock(&priv->state_lock); err = mlx5e_open_locked(netdev); if (!err) - mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP); + mlx5e_modify_admin_state(priv->mdev, MLX5_PORT_UP); mutex_unlock(&priv->state_lock); return err; @@ -3135,7 +3154,7 @@ int mlx5e_close(struct net_device *netdev) return -ENODEV; mutex_lock(&priv->state_lock); - mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_DOWN); + mlx5e_modify_admin_state(priv->mdev, MLX5_PORT_DOWN); err = mlx5e_close_locked(netdev); mutex_unlock(&priv->state_lock); @@ -5182,7 +5201,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) /* Marking the link as currently not needed by the Driver */ if (!netif_running(netdev)) - mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN); + mlx5e_modify_admin_state(mdev, MLX5_PORT_DOWN); mlx5e_set_netdev_mtu_boundaries(priv); mlx5e_set_dev_port_mtu(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 8c294ab43f90..9519a61bd8ec 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1081,6 +1081,8 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) mlx5e_rep_tc_enable(priv); + mlx5_modify_vport_admin_state(mdev, MLX5_VPORT_STATE_OP_MOD_UPLINK, + 0, 0, MLX5_VPORT_ADMIN_STATE_AUTO); mlx5_lag_add(mdev, netdev); priv->events_nb.notifier_call = uplink_rep_async_event; mlx5_notifier_register(mdev, &priv->events_nb); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index d9376627584e..71d01143c455 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1826,6 +1826,8 @@ int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, u16 vport, int link_state) { struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + int opmod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + int other_vport = 1; int err = 0; if (!ESW_ALLOWED(esw)) @@ -1833,15 +1835,17 @@ int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, if (IS_ERR(evport)) return PTR_ERR(evport); + if (vport == MLX5_VPORT_UPLINK) { + opmod = MLX5_VPORT_STATE_OP_MOD_UPLINK; + other_vport = 0; + vport = 0; + } mutex_lock(&esw->state_lock); - err = mlx5_modify_vport_admin_state(esw->dev, - MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, - vport, 1, link_state); + err = mlx5_modify_vport_admin_state(esw->dev, opmod, vport, other_vport, link_state); if (err) { - mlx5_core_warn(esw->dev, - "Failed to set vport %d link state, err = %d", - vport, err); + mlx5_core_warn(esw->dev, "Failed to set vport %d link state, opmod = %d, err = %d", + vport, opmod, err); goto unlock; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index a5175e98c0b3..5785596f13f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -680,6 +680,8 @@ static inline int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) { r static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf) {} static inline bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1) { return true; } static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; } +static inline +int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, u16 vport, int link_state) { return 0; } static inline const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev) { return ERR_PTR(-EOPNOTSUPP); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 073b79eacc99..1340e02b14ef 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4381,6 +4381,7 @@ struct mlx5_ifc_query_vport_state_out_bits { enum { MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT = 0x0, MLX5_VPORT_STATE_OP_MOD_ESW_VPORT = 0x1, + MLX5_VPORT_STATE_OP_MOD_UPLINK = 0x2, }; struct mlx5_ifc_arm_monitor_counter_in_bits { -- cgit v1.2.3 From 1748f6a2cbc4694523f16da1c892b59861045b9d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 24 Jul 2020 20:12:53 +1000 Subject: rhashtable: Fix unprotected RCU dereference in __rht_ptr The rcu_dereference call in rht_ptr_rcu is completely bogus because we've already dereferenced the value in __rht_ptr and operated on it. This causes potential double readings which could be fatal. The RCU dereference must occur prior to the comparison in __rht_ptr. This patch changes the order of RCU dereference so that it is done first and the result is then fed to __rht_ptr. The RCU marking changes have been minimised using casts which will be removed in a follow-up patch. Fixes: ba6306e3f648 ("rhashtable: Remove RCU marking from...") Reported-by: "Gong, Sishuai" Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index d3432ee65de7..b8feb5da7c5a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -349,11 +349,11 @@ static inline void rht_unlock(struct bucket_table *tbl, local_bh_enable(); } -static inline struct rhash_head __rcu *__rht_ptr( - struct rhash_lock_head *const *bkt) +static inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { - return (struct rhash_head __rcu *) - ((unsigned long)*bkt & ~BIT(0) ?: + return (struct rhash_head *) + ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } @@ -365,25 +365,26 @@ static inline struct rhash_head __rcu *__rht_ptr( * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( - struct rhash_lock_head *const *bkt) + struct rhash_lock_head *const *p) { - struct rhash_head __rcu *p = __rht_ptr(bkt); - - return rcu_dereference(p); + struct rhash_lock_head __rcu *const *bkt = (void *)p; + return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( - struct rhash_lock_head *const *bkt, + struct rhash_lock_head *const *p, struct bucket_table *tbl, unsigned int hash) { - return rht_dereference_bucket(__rht_ptr(bkt), tbl, hash); + struct rhash_lock_head __rcu *const *bkt = (void *)p; + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( - struct rhash_lock_head *const *bkt) + struct rhash_lock_head *const *p) { - return rcu_dereference_protected(__rht_ptr(bkt), 1); + struct rhash_lock_head __rcu *const *bkt = (void *)p; + return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head **bkt, -- cgit v1.2.3 From ce9b362bf6db51a083c4221ef0f93c16cfb1facf Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 24 Jul 2020 20:14:34 +1000 Subject: rhashtable: Restore RCU marking on rhash_lock_head This patch restores the RCU marking on bucket_table->buckets as it really does need RCU protection. Its removal had led to a fatal bug. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 56 ++++++++++++++++++++-------------------------- lib/rhashtable.c | 35 +++++++++++++---------------- 2 files changed, 40 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b8feb5da7c5a..68dab3e08aad 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -84,7 +84,7 @@ struct bucket_table { struct lockdep_map dep_map; - struct rhash_lock_head *buckets[] ____cacheline_aligned_in_smp; + struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* @@ -261,13 +261,12 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void *arg); void rhashtable_destroy(struct rhashtable *ht); -struct rhash_lock_head **rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash); -struct rhash_lock_head **__rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash); -struct rhash_lock_head **rht_bucket_nested_insert(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash); +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -284,21 +283,21 @@ struct rhash_lock_head **rht_bucket_nested_insert(struct rhashtable *ht, #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) -static inline struct rhash_lock_head *const *rht_bucket( +static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } -static inline struct rhash_lock_head **rht_bucket_var( +static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } -static inline struct rhash_lock_head **rht_bucket_insert( +static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : @@ -325,7 +324,7 @@ static inline struct rhash_lock_head **rht_bucket_insert( */ static inline void rht_lock(struct bucket_table *tbl, - struct rhash_lock_head **bkt) + struct rhash_lock_head __rcu **bkt) { local_bh_disable(); bit_spin_lock(0, (unsigned long *)bkt); @@ -333,7 +332,7 @@ static inline void rht_lock(struct bucket_table *tbl, } static inline void rht_lock_nested(struct bucket_table *tbl, - struct rhash_lock_head **bucket, + struct rhash_lock_head __rcu **bucket, unsigned int subclass) { local_bh_disable(); @@ -342,7 +341,7 @@ static inline void rht_lock_nested(struct bucket_table *tbl, } static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head **bkt) + struct rhash_lock_head __rcu **bkt) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); @@ -365,48 +364,41 @@ static inline struct rhash_head *__rht_ptr( * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( - struct rhash_lock_head *const *p) + struct rhash_lock_head __rcu *const *bkt) { - struct rhash_lock_head __rcu *const *bkt = (void *)p; return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( - struct rhash_lock_head *const *p, + struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { - struct rhash_lock_head __rcu *const *bkt = (void *)p; return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( - struct rhash_lock_head *const *p) + struct rhash_lock_head __rcu *const *bkt) { - struct rhash_lock_head __rcu *const *bkt = (void *)p; return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } -static inline void rht_assign_locked(struct rhash_lock_head **bkt, +static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { - struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; - if (rht_is_a_nulls(obj)) obj = NULL; - rcu_assign_pointer(*p, (void *)((unsigned long)obj | BIT(0))); + rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, - struct rhash_lock_head **bkt, + struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { - struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; - if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); - rcu_assign_pointer(*p, obj); + rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); local_bh_enable(); @@ -594,7 +586,7 @@ static inline struct rhash_head *__rhashtable_lookup( .ht = ht, .key = key, }; - struct rhash_lock_head *const *bkt; + struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; @@ -710,7 +702,7 @@ static inline void *__rhashtable_insert_fast( .ht = ht, .key = key, }; - struct rhash_lock_head **bkt; + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; @@ -996,7 +988,7 @@ static inline int __rhashtable_remove_fast_one( struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { - struct rhash_lock_head **bkt; + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned int hash; @@ -1148,7 +1140,7 @@ static inline int __rhashtable_replace_fast( struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { - struct rhash_lock_head **bkt; + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned int hash; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9f6890aedd1a..c949c1e3b87c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -31,7 +31,7 @@ union nested_table { union nested_table __rcu *table; - struct rhash_lock_head *bucket; + struct rhash_lock_head __rcu *bucket; }; static u32 head_hashfn(struct rhashtable *ht, @@ -222,7 +222,7 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, } static int rhashtable_rehash_one(struct rhashtable *ht, - struct rhash_lock_head **bkt, + struct rhash_lock_head __rcu **bkt, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); @@ -275,7 +275,7 @@ static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct rhash_lock_head **bkt = rht_bucket_var(old_tbl, old_hash); + struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); int err; if (!bkt) @@ -485,7 +485,7 @@ fail: } static void *rhashtable_lookup_one(struct rhashtable *ht, - struct rhash_lock_head **bkt, + struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, const void *key, struct rhash_head *obj) { @@ -535,12 +535,10 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, return ERR_PTR(-ENOENT); } -static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, - struct rhash_lock_head **bkt, - struct bucket_table *tbl, - unsigned int hash, - struct rhash_head *obj, - void *data) +static struct bucket_table *rhashtable_insert_one( + struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, + void *data) { struct bucket_table *new_tbl; struct rhash_head *head; @@ -591,7 +589,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, { struct bucket_table *new_tbl; struct bucket_table *tbl; - struct rhash_lock_head **bkt; + struct rhash_lock_head __rcu **bkt; unsigned int hash; void *data; @@ -1173,8 +1171,8 @@ void rhashtable_destroy(struct rhashtable *ht) } EXPORT_SYMBOL_GPL(rhashtable_destroy); -struct rhash_lock_head **__rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash) +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); @@ -1202,10 +1200,10 @@ struct rhash_lock_head **__rht_bucket_nested(const struct bucket_table *tbl, } EXPORT_SYMBOL_GPL(__rht_bucket_nested); -struct rhash_lock_head **rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash) +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) { - static struct rhash_lock_head *rhnull; + static struct rhash_lock_head __rcu *rhnull; if (!rhnull) INIT_RHT_NULLS_HEAD(rhnull); @@ -1213,9 +1211,8 @@ struct rhash_lock_head **rht_bucket_nested(const struct bucket_table *tbl, } EXPORT_SYMBOL_GPL(rht_bucket_nested); -struct rhash_lock_head **rht_bucket_nested_insert(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); -- cgit v1.2.3 From f227e3ec3b5cad859ad15666874405e8c1bbc1d4 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 10 Jul 2020 15:23:19 +0200 Subject: random32: update the net random state on interrupt and activity This modifies the first 32 bits out of the 128 bits of a random CPU's net_rand_state on interrupt or CPU activity to complicate remote observations that could lead to guessing the network RNG's internal state. Note that depending on some network devices' interrupt rate moderation or binding, this re-seeding might happen on every packet or even almost never. In addition, with NOHZ some CPUs might not even get timer interrupts, leaving their local state rarely updated, while they are running networked processes making use of the random state. For this reason, we also perform this update in update_process_times() in order to at least update the state when there is user or system activity, since it's the only case we care about. Reported-by: Amit Klein Suggested-by: Linus Torvalds Cc: Eric Dumazet Cc: "Jason A. Donenfeld" Cc: Andy Lutomirski Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- drivers/char/random.c | 1 + include/linux/random.h | 3 +++ kernel/time/timer.c | 8 ++++++++ lib/random32.c | 2 +- 4 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index 2a41b21623ae..d20ba1b104ca 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1277,6 +1277,7 @@ void add_interrupt_randomness(int irq, int irq_flags) fast_mix(fast_pool); add_interrupt_bench(cycles); + this_cpu_add(net_rand_state.s1, fast_pool->pool[cycles & 3]); if (unlikely(crng_init == 0)) { if ((fast_pool->count >= 64) && diff --git a/include/linux/random.h b/include/linux/random.h index 45e1f8fa742b..39aaa1f78f9d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -119,6 +120,8 @@ struct rnd_state { __u32 s1, s2, s3, s4; }; +DECLARE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; + u32 prandom_u32_state(struct rnd_state *state); void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index df1ff803acc4..026ac01af9da 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1742,6 +1743,13 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); + + /* The current CPU might make use of net randoms without receiving IRQs + * to renew them often enough. Let's update the net_rand_state from a + * non-constant value that's not affine to the number of calls to make + * sure it's updated when there's some activity (we don't care in idle). + */ + this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** diff --git a/lib/random32.c b/lib/random32.c index 763b920a6206..c4d317be2997 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -48,7 +48,7 @@ static inline void prandom_state_selftest(void) } #endif -static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; +DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; /** * prandom_u32_state - seeded pseudo-random number generator. -- cgit v1.2.3 From 83bdc7275e6206f560d247be856bceba3e1ed8f2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 29 Jul 2020 19:11:00 -0700 Subject: random32: remove net_rand_state from the latent entropy gcc plugin It turns out that the plugin right now ends up being really unhappy about the change from 'static' to 'extern' storage that happened in commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity"). This is probably a trivial fix for the latent_entropy plugin, but for now, just remove net_rand_state from the list of things the plugin worries about. Reported-by: Stephen Rothwell Cc: Emese Revfy Cc: Kees Cook Cc: Willy Tarreau Signed-off-by: Linus Torvalds --- include/linux/random.h | 2 +- lib/random32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 39aaa1f78f9d..f310897f051d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -120,7 +120,7 @@ struct rnd_state { __u32 s1, s2, s3, s4; }; -DECLARE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; +DECLARE_PER_CPU(struct rnd_state, net_rand_state); u32 prandom_u32_state(struct rnd_state *state); void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); diff --git a/lib/random32.c b/lib/random32.c index c4d317be2997..3d749abb9e80 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -48,7 +48,7 @@ static inline void prandom_state_selftest(void) } #endif -DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; +DEFINE_PER_CPU(struct rnd_state, net_rand_state); /** * prandom_u32_state - seeded pseudo-random number generator. -- cgit v1.2.3 From 1c9df907da83812e4f33b59d3d142c864d9da57f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 30 Jul 2020 07:59:24 +0200 Subject: random: fix circular include dependency on arm64 after addition of percpu.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Daniel Díaz and Kees Cook independently reported that commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity") broke arm64 due to a circular dependency on include files since the addition of percpu.h in random.h. The correct fix would definitely be to move all the prandom32 stuff out of random.h but for backporting, a smaller solution is preferred. This one replaces linux/percpu.h with asm/percpu.h, and this fixes the problem on x86_64, arm64, arm, and mips. Note that moving percpu.h around didn't change anything and that removing it entirely broke differently. When backporting, such options might still be considered if this patch fails to help. [ It turns out that an alternate fix seems to be to just remove the troublesome remove from the arm64 that causes the circular dependency. But we might as well do the whole belt-and-suspenders thing, and minimize inclusion in too. Either will fix the problem, and both are good changes. - Linus ] Reported-by: Daniel Díaz Reported-by: Kees Cook Tested-by: Marc Zyngier Fixes: f227e3ec3b5c Cc: Stephen Rothwell Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- include/linux/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index f310897f051d..9ab7443bd91b 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include -- cgit v1.2.3