From 26149e3e1f44d27897d0af9ca4bcd723674bad44 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 21 Jul 2019 14:18:42 +0300 Subject: net/mlx5: kTLS, Fix wrong TIS opmod constants Fix the used constants for TLS TIS opmods, per the HW specification. Fixes: a12ff35e0fb7 ("net/mlx5: Introduce TLS TX offload hardware bits and structures") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ce9839c8bc1a..c2f056b5766d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -446,11 +446,11 @@ enum { }; enum { - MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x20, + MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x1, }; enum { - MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x20, + MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x1, }; enum { -- cgit v1.2.3 From a9bc3390327317345dd4683b70970c83ab400ea3 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 30 Jul 2019 11:55:25 +0300 Subject: net/mlx5e: kTLS, Fix progress params context WQE layout The TLS progress params context WQE should not include an Eth segment, drop it. In addition, align the tls_progress_params layout with the HW specification document: - fix the tisn field name. - remove the valid bit. Fixes: a12ff35e0fb7 ("net/mlx5: Introduce TLS TX offload hardware bits and structures") Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +++++++-- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 4 ++-- include/linux/mlx5/mlx5_ifc.h | 5 ++--- 4 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index ce1be2a84231..f6b64a03cd06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -184,8 +184,13 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) struct mlx5e_tx_wqe { struct mlx5_wqe_ctrl_seg ctrl; - struct mlx5_wqe_eth_seg eth; - struct mlx5_wqe_data_seg data[0]; + union { + struct { + struct mlx5_wqe_eth_seg eth; + struct mlx5_wqe_data_seg data[0]; + }; + u8 tls_progress_params_ctx[0]; + }; }; struct mlx5e_rx_wqe_ll { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h index 407da83474ef..b7298f9ee3d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h @@ -11,12 +11,14 @@ #include "accel/tls.h" #define MLX5E_KTLS_STATIC_UMR_WQE_SZ \ - (sizeof(struct mlx5e_umr_wqe) + MLX5_ST_SZ_BYTES(tls_static_params)) + (offsetof(struct mlx5e_umr_wqe, tls_static_params_ctx) + \ + MLX5_ST_SZ_BYTES(tls_static_params)) #define MLX5E_KTLS_STATIC_WQEBBS \ (DIV_ROUND_UP(MLX5E_KTLS_STATIC_UMR_WQE_SZ, MLX5_SEND_WQE_BB)) #define MLX5E_KTLS_PROGRESS_WQE_SZ \ - (sizeof(struct mlx5e_tx_wqe) + MLX5_ST_SZ_BYTES(tls_progress_params)) + (offsetof(struct mlx5e_tx_wqe, tls_progress_params_ctx) + \ + MLX5_ST_SZ_BYTES(tls_progress_params)) #define MLX5E_KTLS_PROGRESS_WQEBBS \ (DIV_ROUND_UP(MLX5E_KTLS_PROGRESS_WQE_SZ, MLX5_SEND_WQE_BB)) #define MLX5E_KTLS_MAX_DUMP_WQEBBS 2 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 3766545ce259..9f67bfb559f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -80,7 +80,7 @@ build_static_params(struct mlx5e_umr_wqe *wqe, u16 pc, u32 sqn, static void fill_progress_params_ctx(void *ctx, struct mlx5e_ktls_offload_context_tx *priv_tx) { - MLX5_SET(tls_progress_params, ctx, pd, priv_tx->tisn); + MLX5_SET(tls_progress_params, ctx, tisn, priv_tx->tisn); MLX5_SET(tls_progress_params, ctx, record_tracker_state, MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START); MLX5_SET(tls_progress_params, ctx, auth_state, @@ -104,7 +104,7 @@ build_progress_params(struct mlx5e_tx_wqe *wqe, u16 pc, u32 sqn, PROGRESS_PARAMS_DS_CNT); cseg->fm_ce_se = fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; - fill_progress_params_ctx(wqe->data, priv_tx); + fill_progress_params_ctx(wqe->tls_progress_params_ctx, priv_tx); } static void tx_fill_wi(struct mlx5e_txqsq *sq, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ec571fd7fcf8..b8b570c30b5e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10054,9 +10054,8 @@ struct mlx5_ifc_tls_static_params_bits { }; struct mlx5_ifc_tls_progress_params_bits { - u8 valid[0x1]; - u8 reserved_at_1[0x7]; - u8 pd[0x18]; + u8 reserved_at_0[0x8]; + u8 tisn[0x18]; u8 next_record_tcp_sn[0x20]; -- cgit v1.2.3 From 414776621d1006e57e80e6db7fdc3837897aaa64 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Aug 2019 17:03:59 -0700 Subject: net/tls: prevent skb_orphan() from leaking TLS plain text with offload sk_validate_xmit_skb() and drivers depend on the sk member of struct sk_buff to identify segments requiring encryption. Any operation which removes or does not preserve the original TLS socket such as skb_orphan() or skb_clone() will cause clear text leaks. Make the TCP socket underlying an offloaded TLS connection mark all skbs as decrypted, if TLS TX is in offload mode. Then in sk_validate_xmit_skb() catch skbs which have no socket (or a socket with no validation) and decrypted flag set. Note that CONFIG_SOCK_VALIDATE_XMIT, CONFIG_TLS_DEVICE and sk->sk_validate_xmit_skb are slightly interchangeable right now, they all imply TLS offload. The new checks are guarded by CONFIG_TLS_DEVICE because that's the option guarding the sk_buff->decrypted member. Second, smaller issue with orphaning is that it breaks the guarantee that packets will be delivered to device queues in-order. All TLS offload drivers depend on that scheduling property. This means skb_orphan_partial()'s trick of preserving partial socket references will cause issues in the drivers. We need a full orphan, and as a result netem delay/throttling will cause all TLS offload skbs to be dropped. Reusing the sk_buff->decrypted flag also protects from leaking clear text when incoming, decrypted skb is redirected (e.g. by TC). See commit 0608c69c9a80 ("bpf: sk_msg, sock{map|hash} redirect through ULP") for justification why the internal flag is safe. The only location which could leak the flag in is tcp_bpf_sendmsg(), which is taken care of by clearing the previously unused bit. v2: - remove superfluous decrypted mark copy (Willem); - remove the stale doc entry (Boris); - rely entirely on EOR marking to prevent coalescing (Boris); - use an internal sendpages flag instead of marking the socket (Boris). v3 (Willem): - reorganize the can_skb_orphan_partial() condition; - fix the flag leak-in through tcp_bpf_sendmsg. Signed-off-by: Jakub Kicinski Acked-by: Willem de Bruijn Reviewed-by: Boris Pismenny Signed-off-by: David S. Miller --- Documentation/networking/tls-offload.rst | 18 ------------------ include/linux/skbuff.h | 8 ++++++++ include/linux/socket.h | 3 +++ include/net/sock.h | 10 +++++++++- net/core/sock.c | 19 ++++++++++++++----- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_bpf.c | 6 +++++- net/ipv4/tcp_output.c | 3 +++ net/tls/tls_device.c | 9 +++++++-- 9 files changed, 52 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index b70b70dc4524..0dd3f748239f 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -506,21 +506,3 @@ Drivers should ignore the changes to TLS the device feature flags. These flags will be acted upon accordingly by the core ``ktls`` code. TLS device feature flags only control adding of new TLS connection offloads, old connections will remain active after flags are cleared. - -Known bugs -========== - -skb_orphan() leaks clear text ------------------------------ - -Currently drivers depend on the :c:member:`sk` member of -:c:type:`struct sk_buff ` to identify segments requiring -encryption. Any operation which removes or does not preserve the socket -association such as :c:func:`skb_orphan` or :c:func:`skb_clone` -will cause the driver to miss the packets and lead to clear text leaks. - -Redirects leak clear text -------------------------- - -In the RX direction, if segment has already been decrypted by the device -and it gets redirected or mirrored - clear text will be transmitted out. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8af86d995d6..ba5583522d24 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1374,6 +1374,14 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline void skb_copy_decrypted(struct sk_buff *to, + const struct sk_buff *from) +{ +#ifdef CONFIG_TLS_DEVICE + to->decrypted = from->decrypted; +#endif +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff --git a/include/linux/socket.h b/include/linux/socket.h index 97523818cb14..fc0bed59fc84 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -292,6 +292,9 @@ struct ucred { #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ +#define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry + * plain text and require encryption + */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ diff --git a/include/net/sock.h b/include/net/sock.h index 228db3998e46..2c53f1a1d905 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2482,6 +2482,7 @@ static inline bool sk_fullsock(const struct sock *sk) /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) @@ -2489,8 +2490,15 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); +#ifdef CONFIG_TLS_DEVICE + } else if (unlikely(skb->decrypted)) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; +#endif + } #endif return skb; diff --git a/net/core/sock.c b/net/core/sock.c index d57b0cc995a0..6d08553f885c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1992,6 +1992,19 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL(skb_set_owner_w); +static bool can_skb_orphan_partial(const struct sk_buff *skb) +{ +#ifdef CONFIG_TLS_DEVICE + /* Drivers depend on in-order delivery for crypto offload, + * partial orphan breaks out-of-order-OK logic. + */ + if (skb->decrypted) + return false; +#endif + return (skb->destructor == sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); +} + /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. @@ -2003,11 +2016,7 @@ void skb_orphan_partial(struct sk_buff *skb) if (skb_is_tcp_pure_ack(skb)) return; - if (skb->destructor == sock_wfree -#ifdef CONFIG_INET - || skb->destructor == tcp_wfree -#endif - ) { + if (can_skb_orphan_partial(skb)) { struct sock *sk = skb->sk; if (refcount_inc_not_zero(&sk->sk_refcnt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 776905899ac0..77b485d60b9d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -984,6 +984,9 @@ new_segment: if (!skb) goto wait_for_memory; +#ifdef CONFIG_TLS_DEVICE + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif skb_entail(sk, skb); copy = size_goal; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3d1e15401384..8a56e09cfb0e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -398,10 +398,14 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; int copied = 0, err = 0; struct sk_psock *psock; long timeo; + int flags; + + /* Don't let internal do_tcp_sendpages() flags through */ + flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); + flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6e4afc48d7bb..979520e46e33 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1320,6 +1320,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, buff = sk_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ + skb_copy_decrypted(buff, skb); sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -1874,6 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; + skb_copy_decrypted(buff, skb); sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -2143,6 +2145,7 @@ static int tcp_mtu_probe(struct sock *sk) sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); + skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 7c0b2b778703..43922d86e510 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -373,9 +373,9 @@ static int tls_push_data(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; + int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; @@ -390,6 +390,9 @@ static int tls_push_data(struct sock *sk, if (sk->sk_err) return -sk->sk_err; + flags |= MSG_SENDPAGE_DECRYPTED; + tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (tls_is_partially_sent_record(tls_ctx)) { rc = tls_push_partial_record(sk, tls_ctx, flags); @@ -576,7 +579,9 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) gfp_t sk_allocation = sk->sk_allocation; sk->sk_allocation = GFP_ATOMIC; - tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL); + tls_push_partial_record(sk, ctx, + MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_DECRYPTED); sk->sk_allocation = sk_allocation; } } -- cgit v1.2.3 From b884e2de2afc68ce30f7093747378ef972dde253 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 30 Jul 2019 21:29:54 +0800 Subject: lib: logic_pio: Add logic_pio_unregister_range() Add a function to unregister a logical PIO range. Logical PIO space can still be leaked when unregistering certain LOGIC_PIO_CPU_MMIO regions, but this acceptable for now since there are no callers to unregister LOGIC_PIO_CPU_MMIO regions, and the logical PIO region allocation scheme would need significant work to improve this. Cc: stable@vger.kernel.org Signed-off-by: John Garry Signed-off-by: Wei Xu --- include/linux/logic_pio.h | 1 + lib/logic_pio.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/logic_pio.h b/include/linux/logic_pio.h index cbd9d8495690..88e1e6304a71 100644 --- a/include/linux/logic_pio.h +++ b/include/linux/logic_pio.h @@ -117,6 +117,7 @@ struct logic_pio_hwaddr *find_io_range_by_fwnode(struct fwnode_handle *fwnode); unsigned long logic_pio_trans_hwaddr(struct fwnode_handle *fwnode, resource_size_t hw_addr, resource_size_t size); int logic_pio_register_range(struct logic_pio_hwaddr *newrange); +void logic_pio_unregister_range(struct logic_pio_hwaddr *range); resource_size_t logic_pio_to_hwaddr(unsigned long pio); unsigned long logic_pio_trans_cpuaddr(resource_size_t hw_addr); diff --git a/lib/logic_pio.c b/lib/logic_pio.c index d0165c88f705..905027574e5d 100644 --- a/lib/logic_pio.c +++ b/lib/logic_pio.c @@ -98,6 +98,20 @@ end_register: return ret; } +/** + * logic_pio_unregister_range - unregister a logical PIO range for a host + * @range: pointer to the IO range which has been already registered. + * + * Unregister a previously-registered IO range node. + */ +void logic_pio_unregister_range(struct logic_pio_hwaddr *range) +{ + mutex_lock(&io_range_mutex); + list_del_rcu(&range->list); + mutex_unlock(&io_range_mutex); + synchronize_rcu(); +} + /** * find_io_range_by_fwnode - find logical PIO range for given FW node * @fwnode: FW node handle associated with logical PIO range -- cgit v1.2.3 From 68e03b85474a51ec1921b4d13204782594ef7223 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 31 Jul 2019 20:38:14 +0800 Subject: gpio: Fix build error of function redefinition when do randbuilding, I got this error: In file included from drivers/hwmon/pmbus/ucd9000.c:19:0: ./include/linux/gpio/driver.h:576:1: error: redefinition of gpiochip_add_pin_range gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, ^~~~~~~~~~~~~~~~~~~~~~ In file included from drivers/hwmon/pmbus/ucd9000.c:18:0: ./include/linux/gpio.h:245:1: note: previous definition of gpiochip_add_pin_range was here gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, ^~~~~~~~~~~~~~~~~~~~~~ Reported-by: Hulk Robot Fixes: 964cb341882f ("gpio: move pincontrol calls to ") Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20190731123814.46624-1-yuehaibing@huawei.com Signed-off-by: Linus Walleij --- include/linux/gpio.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 40915b461f18..f757a58191a6 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -241,30 +241,6 @@ static inline int irq_to_gpio(unsigned irq) return -EINVAL; } -static inline int -gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins) -{ - WARN_ON(1); - return -EINVAL; -} - -static inline int -gpiochip_add_pingroup_range(struct gpio_chip *chip, - struct pinctrl_dev *pctldev, - unsigned int gpio_offset, const char *pin_group) -{ - WARN_ON(1); - return -EINVAL; -} - -static inline void -gpiochip_remove_pin_ranges(struct gpio_chip *chip) -{ - WARN_ON(1); -} - static inline int devm_gpio_request(struct device *dev, unsigned gpio, const char *label) { -- cgit v1.2.3 From 33da8e7c814f77310250bb54a9db36a44c5de784 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 16 Aug 2019 12:33:54 -0500 Subject: signal: Allow cifs and drbd to receive their terminating signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My recent to change to only use force_sig for a synchronous events wound up breaking signal reception cifs and drbd. I had overlooked the fact that by default kthreads start out with all signals set to SIG_IGN. So a change I thought was safe turned out to have made it impossible for those kernel thread to catch their signals. Reverting the work on force_sig is a bad idea because what the code was doing was very much a misuse of force_sig. As the way force_sig ultimately allowed the signal to happen was to change the signal handler to SIG_DFL. Which after the first signal will allow userspace to send signals to these kernel threads. At least for wake_ack_receiver in drbd that does not appear actively wrong. So correct this problem by adding allow_kernel_signal that will allow signals whose siginfo reports they were sent by the kernel through, but will not allow userspace generated signals, and update cifs and drbd to call allow_kernel_signal in an appropriate place so that their thread can receive this signal. Fixing things this way ensures that userspace won't be able to send signals and cause problems, that it is clear which signals the threads are expecting to receive, and it guarantees that nothing else in the system will be affected. This change was partly inspired by similar cifs and drbd patches that added allow_signal. Reported-by: ronnie sahlberg Reported-by: Christoph Böhmwalder Tested-by: Christoph Böhmwalder Cc: Steve French Cc: Philipp Reisner Cc: David Laight Fixes: 247bc9470b1e ("cifs: fix rmmod regression in cifs.ko caused by force_sig changes") Fixes: 72abe3bcf091 ("signal/cifs: Fix cifs_put_tcp_session to call send_sig instead of force_sig") Fixes: fee109901f39 ("signal/drbd: Use send_sig not force_sig") Fixes: 3cf5d076fb4d ("signal: Remove task parameter from force_sig") Signed-off-by: "Eric W. Biederman" --- drivers/block/drbd/drbd_main.c | 2 ++ fs/cifs/connect.c | 2 +- include/linux/signal.h | 15 ++++++++++++++- kernel/signal.c | 5 +++++ 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 9bd4ddd12b25..5b248763a672 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -322,6 +322,8 @@ static int drbd_thread_setup(void *arg) thi->name[0], resource->name); + allow_kernel_signal(DRBD_SIGKILL); + allow_kernel_signal(SIGXCPU); restart: retval = thi->function(thi); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a15a6e738eb5..1795e80cbdf7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1113,7 +1113,7 @@ cifs_demultiplex_thread(void *p) mempool_resize(cifs_req_poolp, length + cifs_min_rcv); set_freezable(); - allow_signal(SIGKILL); + allow_kernel_signal(SIGKILL); while (server->tcpStatus != CifsExiting) { if (try_to_freeze()) continue; diff --git a/include/linux/signal.h b/include/linux/signal.h index b5d99482d3fe..1a5f88316b08 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -282,6 +282,9 @@ extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); +#define SIG_KTHREAD ((__force __sighandler_t)2) +#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3) + static inline void allow_signal(int sig) { /* @@ -289,7 +292,17 @@ static inline void allow_signal(int sig) * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ - kernel_sigaction(sig, (__force __sighandler_t)2); + kernel_sigaction(sig, SIG_KTHREAD); +} + +static inline void allow_kernel_signal(int sig) +{ + /* + * Kernel threads handle their own signals. Let the signal code + * know signals sent by the kernel will be handled, so that they + * don't get silently dropped. + */ + kernel_sigaction(sig, SIG_KTHREAD_KERNEL); } static inline void disallow_signal(int sig) diff --git a/kernel/signal.c b/kernel/signal.c index e667be6907d7..534fec266a33 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; + /* Only allow kernel generated signals to this kthread */ + if (unlikely((t->flags & PF_KTHREAD) && + (handler == SIG_KTHREAD_KERNEL) && !force)) + return true; + return sig_handler_ignored(handler, sig); } -- cgit v1.2.3 From 38a429c898ddd210cc35463b096389f97c3c5a73 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 19 Aug 2019 16:39:27 +0900 Subject: netfilter: add include guard to nf_conntrack_h323_types.h Add a header include guard just in case. Signed-off-by: Masahiro Yamada Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_h323_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_h323_types.h b/include/linux/netfilter/nf_conntrack_h323_types.h index 7a6871ac8784..74c6f9241944 100644 --- a/include/linux/netfilter/nf_conntrack_h323_types.h +++ b/include/linux/netfilter/nf_conntrack_h323_types.h @@ -4,6 +4,9 @@ * Copyright (c) 2006 Jing Min Zhao */ +#ifndef _NF_CONNTRACK_H323_TYPES_H +#define _NF_CONNTRACK_H323_TYPES_H + typedef struct TransportAddress_ipAddress { /* SEQUENCE */ int options; /* No use */ unsigned int ip; @@ -931,3 +934,5 @@ typedef struct RasMessage { /* CHOICE */ InfoRequestResponse infoRequestResponse; }; } RasMessage; + +#endif /* _NF_CONNTRACK_H323_TYPES_H */ -- cgit v1.2.3 From 555df336c754ac9de1af9a5c72508918b3796b18 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 16:02:01 +0100 Subject: keys: Fix description size The maximum key description size is 4095. Commit f771fde82051 ("keys: Simplify key description management") inadvertantly reduced that to 255 and made sizes between 256 and 4095 work weirdly, and any size whereby size & 255 == 0 would cause an assertion in __key_link_begin() at the following line: BUG_ON(index_key->desc_len == 0); This can be fixed by simply increasing the size of desc_len in struct keyring_index_key to a u16. Note the argument length test in keyutils only checked empty descriptions and descriptions with a size around the limit (ie. 4095) and not for all the values in between, so it missed this. This has been addressed and https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/keyutils.git/commit/?id=066bf56807c26cd3045a25f355b34c1d8a20a5aa now exhaustively tests all possible lengths of type, description and payload and then some. The assertion failure looks something like: kernel BUG at security/keys/keyring.c:1245! ... RIP: 0010:__key_link_begin+0x88/0xa0 ... Call Trace: key_create_or_update+0x211/0x4b0 __x64_sys_add_key+0x101/0x200 do_syscall_64+0x5b/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 It can be triggered by: keyctl add user "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" a @s Fixes: f771fde82051 ("keys: Simplify key description management") Reported-by: kernel test robot Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/linux/key.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 91f391cd272e..50028338a4cc 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -94,11 +94,11 @@ struct keyring_index_key { union { struct { #ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */ - u8 desc_len; - char desc[sizeof(long) - 1]; /* First few chars of description */ + u16 desc_len; + char desc[sizeof(long) - 2]; /* First few chars of description */ #else - char desc[sizeof(long) - 1]; /* First few chars of description */ - u8 desc_len; + char desc[sizeof(long) - 2]; /* First few chars of description */ + u16 desc_len; #endif }; unsigned long x; -- cgit v1.2.3 From 90ae409f9eb3bcaf38688f9ec22375816053a08e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Aug 2019 11:45:49 +0900 Subject: dma-direct: fix zone selection after an unaddressable CMA allocation The new dma_alloc_contiguous hides if we allocate CMA or regular pages, and thus fails to retry a ZONE_NORMAL allocation if the CMA allocation succeeds but isn't addressable. That means we either fail outright or dip into a small zone that might not succeed either. Thanks to Hillf Danton for debugging this issue. Fixes: b1d2dc009dec ("dma-contiguous: add dma_{alloc,free}_contiguous() helpers") Reported-by: Tobias Klausmann Signed-off-by: Christoph Hellwig Tested-by: Tobias Klausmann --- drivers/iommu/dma-iommu.c | 3 +++ include/linux/dma-contiguous.h | 5 +---- kernel/dma/contiguous.c | 8 ++------ kernel/dma/direct.c | 10 +++++++++- 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index d991d40f797f..f68a62c3c32b 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -965,10 +965,13 @@ static void *iommu_dma_alloc_pages(struct device *dev, size_t size, { bool coherent = dev_is_dma_coherent(dev); size_t alloc_size = PAGE_ALIGN(size); + int node = dev_to_node(dev); struct page *page = NULL; void *cpu_addr; page = dma_alloc_contiguous(dev, alloc_size, gfp); + if (!page) + page = alloc_pages_node(node, gfp, get_order(alloc_size)); if (!page) return NULL; diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index c05d4e661489..03f8e98e3bcc 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -160,10 +160,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, static inline struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - size_t align = get_order(PAGE_ALIGN(size)); - - return alloc_pages_node(node, gfp, align); + return NULL; } static inline void dma_free_contiguous(struct device *dev, struct page *page, diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 2bd410f934b3..69cfb4345388 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; - size_t align = get_order(PAGE_ALIGN(size)); + size_t count = size >> PAGE_SHIFT; struct page *page = NULL; struct cma *cma = NULL; @@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) /* CMA can be used only in the context which permits sleeping */ if (cma && gfpflags_allow_blocking(gfp)) { + size_t align = get_order(size); size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); } - /* Fallback allocation of normal pages */ - if (!page) - page = alloc_pages_node(node, gfp, align); return page; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 795c9b095d75..706113c6bebc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + size_t alloc_size = PAGE_ALIGN(size); + int node = dev_to_node(dev); struct page *page = NULL; u64 phys_mask; @@ -95,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); + page = dma_alloc_contiguous(dev, alloc_size, gfp); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_free_contiguous(dev, page, alloc_size); + page = NULL; + } again: - page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(node, gfp, get_order(alloc_size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; -- cgit v1.2.3 From 5c498950f730aa17c5f8a2cdcb903524e4002ed2 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 19 Jul 2019 15:32:19 +0100 Subject: libceph: allow ceph_buffer_put() to receive a NULL ceph_buffer Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/buffer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h index 5e58bb29b1a3..11cdc7c60480 100644 --- a/include/linux/ceph/buffer.h +++ b/include/linux/ceph/buffer.h @@ -30,7 +30,8 @@ static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) static inline void ceph_buffer_put(struct ceph_buffer *b) { - kref_put(&b->kref, ceph_buffer_release); + if (b) + kref_put(&b->kref, ceph_buffer_release); } extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); -- cgit v1.2.3 From b99328a60a482108f5195b4d611f90992ca016ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Aug 2019 13:00:15 +0200 Subject: timekeeping/vsyscall: Prevent math overflow in BOOTTIME update The VDSO update for CLOCK_BOOTTIME has a overflow issue as it shifts the nanoseconds based boot time offset left by the clocksource shift. That overflows once the boot time offset becomes large enough. As a consequence CLOCK_BOOTTIME in the VDSO becomes a random number causing applications to misbehave. Fix it by storing a timespec64 representation of the offset when boot time is adjusted and add that to the MONOTONIC base time value in the vdso data page. Using the timespec64 representation avoids a 64bit division in the update code. Fixes: 44f57d788e7d ("timekeeping: Provide a generic update_vsyscall() implementation") Reported-by: Chris Clayton Signed-off-by: Thomas Gleixner Tested-by: Chris Clayton Tested-by: Vincenzo Frascino Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908221257580.1983@nanos.tec.linutronix.de --- include/linux/timekeeper_internal.h | 5 +++++ kernel/time/timekeeping.c | 5 +++++ kernel/time/vsyscall.c | 22 +++++++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 7acb953298a7..84ff2844df2a 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -57,6 +57,7 @@ struct tk_read_base { * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP * interval. @@ -84,6 +85,9 @@ struct tk_read_base { * * wall_to_monotonic is no longer the boot time, getboottime must be * used instead. + * + * @monotonic_to_boottime is a timespec64 representation of @offs_boot to + * accelerate the VDSO update for CLOCK_BOOTTIME. */ struct timekeeper { struct tk_read_base tkr_mono; @@ -99,6 +103,7 @@ struct timekeeper { u8 cs_was_changed_seq; ktime_t next_leap_ktime; u64 raw_sec; + struct timespec64 monotonic_to_boot; /* The following members are for timekeeping internal use */ u64 cycle_interval; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d911c8470149..ca69290bee2a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* + * Timespec representation for VDSO update to avoid 64bit division + * on every update. + */ + tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 8cf3596a4ce6..4bc37ac3bb05 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; - u64 nsec; + u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; @@ -45,23 +45,27 @@ static inline void update_vdso_data(struct vdso_data *vdata, } vdso_ts->nsec = nsec; - /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; - vdso_ts->sec = tk->raw_sec; - vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* Copy MONOTONIC time for BOOTTIME */ + sec = vdso_ts->sec; + /* Add the boot offset */ + sec += tk->monotonic_to_boot.tv_sec; + nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; - vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec; - nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + - ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + vdso_ts->sec = sec; + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; -- cgit v1.2.3 From d5711920ec6e578f51db95caa6f185f5090b865e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Aug 2019 08:37:26 -0400 Subject: Revert "NFSv4/flexfiles: Abort I/O early if the layout segment was invalidated" This reverts commit a79f194aa4879e9baad118c3f8bb2ca24dbef765. The mechanism for aborting I/O is racy, since we are not guaranteed that the request is asleep while we're changing both task->tk_status and task->tk_action. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v5.1 --- fs/nfs/flexfilelayout/flexfilelayout.c | 17 ----------------- include/linux/sunrpc/sched.h | 1 - net/sunrpc/xprt.c | 7 ------- 3 files changed, 25 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b04e20d28162..2c7e1eca1ed7 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1148,8 +1148,6 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; - case -EAGAIN: - return -NFS4ERR_RESET_TO_PNFS; /* Invalidate Layout errors */ case -NFS4ERR_PNFS_NO_LAYOUT: case -ESTALE: /* mapped NFS4ERR_STALE */ @@ -1210,7 +1208,6 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EBADHANDLE: case -ELOOP: case -ENOSPC: - case -EAGAIN: break; case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); @@ -1445,16 +1442,6 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) ff_layout_read_prepare_common(task, hdr); } -static void -ff_layout_io_prepare_transmit(struct rpc_task *task, - void *data) -{ - struct nfs_pgio_header *hdr = data; - - if (!pnfs_is_valid_lseg(hdr->lseg)) - rpc_exit(task, -EAGAIN); -} - static void ff_layout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; @@ -1740,7 +1727,6 @@ static void ff_layout_commit_release(void *data) static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1748,7 +1734,6 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1756,7 +1741,6 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1764,7 +1748,6 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index baa3ecdb882f..27536b961552 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -98,7 +98,6 @@ typedef void (*rpc_action)(struct rpc_task *); struct rpc_call_ops { void (*rpc_call_prepare)(struct rpc_task *, void *); - void (*rpc_call_prepare_transmit)(struct rpc_task *, void *); void (*rpc_call_done)(struct rpc_task *, void *); void (*rpc_count_stats)(struct rpc_task *, void *); void (*rpc_release)(void *); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 783748dc5e6f..2e71f5455c6c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1408,13 +1408,6 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) status = -EBADMSG; goto out_dequeue; } - if (task->tk_ops->rpc_call_prepare_transmit) { - task->tk_ops->rpc_call_prepare_transmit(task, - task->tk_calldata); - status = task->tk_status; - if (status < 0) - goto out_dequeue; - } if (RPC_SIGNALLED(task)) { status = -ERESTARTSYS; goto out_dequeue; -- cgit v1.2.3 From 94acaeb50ced653bfe2c4d8037c70b107af14124 Mon Sep 17 00:00:00 2001 From: Marco Hartmann Date: Wed, 21 Aug 2019 11:00:46 +0000 Subject: Add genphy_c45_config_aneg() function to phy-c45.c Commit 34786005eca3 ("net: phy: prevent PHYs w/o Clause 22 regs from calling genphy_config_aneg") introduced a check that aborts phy_config_aneg() if the phy is a C45 phy. This causes phy_state_machine() to call phy_error() so that the phy ends up in PHY_HALTED state. Instead of returning -EOPNOTSUPP, call genphy_c45_config_aneg() (analogous to the C22 case) so that the state machine can run correctly. genphy_c45_config_aneg() closely resembles mv3310_config_aneg() in drivers/net/phy/marvell10g.c, excluding vendor specific configurations for 1000BaseT. Fixes: 22b56e827093 ("net: phy: replace genphy_10g_driver with genphy_c45_driver") Signed-off-by: Marco Hartmann Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy-c45.c | 26 ++++++++++++++++++++++++++ drivers/net/phy/phy.c | 2 +- include/linux/phy.h | 1 + 3 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 58bb25e4af10..7935593debb1 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -523,6 +523,32 @@ int genphy_c45_read_status(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(genphy_c45_read_status); +/** + * genphy_c45_config_aneg - restart auto-negotiation or forced setup + * @phydev: target phy_device struct + * + * Description: If auto-negotiation is enabled, we configure the + * advertising, and then restart auto-negotiation. If it is not + * enabled, then we force a configuration. + */ +int genphy_c45_config_aneg(struct phy_device *phydev) +{ + bool changed = false; + int ret; + + if (phydev->autoneg == AUTONEG_DISABLE) + return genphy_c45_pma_setup_forced(phydev); + + ret = genphy_c45_an_config_aneg(phydev); + if (ret < 0) + return ret; + if (ret > 0) + changed = true; + + return genphy_c45_check_and_restart_aneg(phydev, changed); +} +EXPORT_SYMBOL_GPL(genphy_c45_config_aneg); + /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index ef7aa738e0dc..6b0f89369b46 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -507,7 +507,7 @@ static int phy_config_aneg(struct phy_device *phydev) * allowed to call genphy_config_aneg() */ if (phydev->is_c45 && !(phydev->c45_ids.devices_in_package & BIT(0))) - return -EOPNOTSUPP; + return genphy_c45_config_aneg(phydev); return genphy_config_aneg(phydev); } diff --git a/include/linux/phy.h b/include/linux/phy.h index 462b90b73f93..2fb9c8ffaf10 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1107,6 +1107,7 @@ int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); +int genphy_c45_config_aneg(struct phy_device *phydev); /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); -- cgit v1.2.3 From bee07b33db78d4ee7ed6a2fe810b9473d5471fe4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 30 Aug 2019 16:04:32 -0700 Subject: mm: memcontrol: flush percpu slab vmstats on kmem offlining I've noticed that the "slab" value in memory.stat is sometimes 0, even if some children memory cgroups have a non-zero "slab" value. The following investigation showed that this is the result of the kmem_cache reparenting in combination with the per-cpu batching of slab vmstats. At the offlining some vmstat value may leave in the percpu cache, not being propagated upwards by the cgroup hierarchy. It means that stats on ancestor levels are lower than actual. Later when slab pages are released, the precise number of pages is substracted on the parent level, making the value negative. We don't show negative values, 0 is printed instead. To fix this issue, let's flush percpu slab memcg and lruvec stats on memcg offlining. This guarantees that numbers on all ancestor levels are accurate and match the actual number of outstanding slab pages. Link: http://lkml.kernel.org/r/20190819202338.363363-3-guro@fb.com Fixes: fb2f2b0adb98 ("mm: memcg/slab: reparent memcg kmem_caches on cgroup removal") Signed-off-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +++-- mm/memcontrol.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d77d717c620c..3f38c30d2f13 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -215,8 +215,9 @@ enum node_stat_item { NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ - NR_SLAB_RECLAIMABLE, - NR_SLAB_UNRECLAIMABLE, + NR_SLAB_RECLAIMABLE, /* Please do not reorder this item */ + NR_SLAB_UNRECLAIMABLE, /* and this one without looking at + * memcg_flush_percpu_vmstats() first. */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26e2999af608..1f585d6c77c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3260,37 +3260,49 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) { unsigned long stat[MEMCG_NR_STAT]; struct mem_cgroup *mi; int node, cpu, i; + int min_idx, max_idx; - for (i = 0; i < MEMCG_NR_STAT; i++) + if (slab_only) { + min_idx = NR_SLAB_RECLAIMABLE; + max_idx = NR_SLAB_UNRECLAIMABLE; + } else { + min_idx = 0; + max_idx = MEMCG_NR_STAT; + } + + for (i = min_idx; i < max_idx; i++) stat[i] = 0; for_each_online_cpu(cpu) - for (i = 0; i < MEMCG_NR_STAT; i++) + for (i = min_idx; i < max_idx; i++) stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < MEMCG_NR_STAT; i++) + for (i = min_idx; i < max_idx; i++) atomic_long_add(stat[i], &mi->vmstats[i]); + if (!slab_only) + max_idx = NR_VM_NODE_STAT_ITEMS; + for_each_node(node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; struct mem_cgroup_per_node *pi; - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = min_idx; i < max_idx; i++) stat[i] = 0; for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = min_idx; i < max_idx; i++) stat[i] += raw_cpu_read( pn->lruvec_stat_cpu->count[i]); for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = min_idx; i < max_idx; i++) atomic_long_add(stat[i], &pi->lruvec_stat[i]); } } @@ -3363,7 +3375,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!parent) parent = root_mem_cgroup; + /* + * Deactivate and reparent kmem_caches. Then flush percpu + * slab statistics to have precise values at the parent and + * all ancestor levels. It's required to keep slab stats + * accurate after the reparenting of kmem_caches. + */ memcg_deactivate_kmem_caches(memcg, parent); + memcg_flush_percpu_vmstats(memcg, true); kmemcg_id = memcg->kmemcg_id; BUG_ON(kmemcg_id < 0); @@ -4740,7 +4759,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) * Flush percpu vmstats and vmevents to guarantee the value correctness * on parent's and all ancestor levels. */ - memcg_flush_percpu_vmstats(memcg); + memcg_flush_percpu_vmstats(memcg, false); memcg_flush_percpu_vmevents(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); -- cgit v1.2.3 From 595a438c78dbdc43d6c9db4f437267f0bd1548bf Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 4 Jul 2019 20:21:10 +0300 Subject: tracing: Make exported ftrace_set_clr_event non-static The function ftrace_set_clr_event is declared static and marked EXPORT_SYMBOL_GPL(), which is at best an odd combination. Because the function was decided to be a part of API, this commit removes the static attribute and adds the declaration to the header. Link: http://lkml.kernel.org/r/20190704172110.27041-1-efremov@linux.com Fixes: f45d1225adb04 ("tracing: Kernel access to Ftrace instances") Reviewed-by: Joe Jin Signed-off-by: Denis Efremov Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 1 + kernel/trace/trace_events.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5150436783e8..30a8cdcfd4a4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -548,6 +548,7 @@ extern int trace_event_get_offsets(struct trace_event_call *call); #define is_signed_type(type) (((type)(-1)) < (type)1) +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c7506bc81b75..648930823b57 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, return ret; } -static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; int ret; -- cgit v1.2.3