From 31c9590ae468478fe47dc0f5f0d3562b2f69450e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 18 Apr 2020 21:06:23 -0400 Subject: SUNRPC: Add "@len" parameter to gss_unwrap() Refactor: This is a pre-requisite to fixing the client-side ralign computation in gss_unwrap_resp_priv(). The length value is passed in explicitly rather that as the value of buf->len. This will subsequently allow gss_unwrap_kerberos_v1() to compute a slack and align value, instead of computing it in gss_unwrap_resp_priv(). Fixes: 35e77d21baa0 ("SUNRPC: Add rpc_auth::au_ralign field") Signed-off-by: Chuck Lever --- include/linux/sunrpc/gss_api.h | 2 ++ include/linux/sunrpc/gss_krb5.h | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 48c1b1674cbf..e9a79518d652 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -66,6 +66,7 @@ u32 gss_wrap( u32 gss_unwrap( struct gss_ctx *ctx_id, int offset, + int len, struct xdr_buf *inbuf); u32 gss_delete_sec_context( struct gss_ctx **ctx_id); @@ -126,6 +127,7 @@ struct gss_api_ops { u32 (*gss_unwrap)( struct gss_ctx *ctx_id, int offset, + int len, struct xdr_buf *buf); void (*gss_delete_sec_context)( void *internal_ctx_id); diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index c1d77dd8ed41..e8f8ffe7448b 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -83,7 +83,7 @@ struct gss_krb5_enctype { u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct page **pages); /* v2 encryption function */ - u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset, + u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset, u32 len, struct xdr_buf *buf, u32 *headskip, u32 *tailskip); /* v2 decryption function */ }; @@ -255,7 +255,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx_id, int offset, struct xdr_buf *outbuf, struct page **pages); u32 -gss_unwrap_kerberos(struct gss_ctx *ctx_id, int offset, +gss_unwrap_kerberos(struct gss_ctx *ctx_id, int offset, int len, struct xdr_buf *buf); @@ -312,7 +312,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, struct page **pages); u32 -gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, +gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, struct xdr_buf *buf, u32 *plainoffset, u32 *plainlen); -- cgit v1.2.3 From a7e429a6fa6d612d1dacde96c885dc1bb4a9f400 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 18 Apr 2020 14:38:19 -0400 Subject: SUNRPC: Fix GSS privacy computation of auth->au_ralign When the au_ralign field was added to gss_unwrap_resp_priv, the wrong calculation was used. Setting au_rslack == au_ralign is probably correct for kerberos_v1 privacy, but kerberos_v2 privacy adds additional GSS data after the clear text RPC message. au_ralign needs to be smaller than au_rslack in that fairly common case. When xdr_buf_trim() is restored to gss_unwrap_kerberos_v2(), it does exactly what I feared it would: it trims off part of the clear text RPC message. However, that's because rpc_prepare_reply_pages() does not set up the rq_rcv_buf's tail correctly because au_ralign is too large. Fixing the au_ralign computation also corrects the alignment of rq_rcv_buf->pages so that the client does not have to shift reply data payloads after they are received. Fixes: 35e77d21baa0 ("SUNRPC: Add rpc_auth::au_ralign field") Signed-off-by: Chuck Lever --- include/linux/sunrpc/gss_api.h | 1 + net/sunrpc/auth_gss/auth_gss.c | 8 +++----- net/sunrpc/auth_gss/gss_krb5_wrap.c | 19 +++++++++++++++---- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index e9a79518d652..bc07e51f20d1 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -21,6 +21,7 @@ struct gss_ctx { struct gss_api_mech *mech_type; void *internal_ctx_id; + unsigned int slack, align; }; #define GSS_C_NO_BUFFER ((struct xdr_netobj) 0) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 7885f37e3688..ac5cac0dd24b 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -2032,7 +2032,6 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; struct kvec *head = rqstp->rq_rcv_buf.head; struct rpc_auth *auth = cred->cr_auth; - unsigned int savedlen = rcv_buf->len; u32 offset, opaque_len, maj_stat; __be32 *p; @@ -2059,10 +2058,9 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, */ xdr_init_decode(xdr, rcv_buf, p, rqstp); - auth->au_rslack = auth->au_verfsize + 2 + - XDR_QUADLEN(savedlen - rcv_buf->len); - auth->au_ralign = auth->au_verfsize + 2 + - XDR_QUADLEN(savedlen - rcv_buf->len); + auth->au_rslack = auth->au_verfsize + 2 + ctx->gc_gss_ctx->slack; + auth->au_ralign = auth->au_verfsize + 2 + ctx->gc_gss_ctx->align; + return 0; unwrap_failed: trace_rpcgss_unwrap_failed(task); diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index c7589e35d5d9..4905652e7567 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -262,7 +262,8 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, static u32 gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf) + struct xdr_buf *buf, unsigned int *slack, + unsigned int *align) { int signalg; int sealalg; @@ -280,6 +281,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len, u32 conflen = kctx->gk5e->conflen; int crypt_offset; u8 *cksumkey; + unsigned int saved_len = buf->len; dprintk("RPC: gss_unwrap_kerberos\n"); @@ -383,6 +385,10 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len, if (gss_krb5_remove_padding(buf, blocksize)) return GSS_S_DEFECTIVE_TOKEN; + /* slack must include room for krb5 padding */ + *slack = XDR_QUADLEN(saved_len - buf->len); + /* The GSS blob always precedes the RPC message payload */ + *align = *slack; return GSS_S_COMPLETE; } @@ -489,7 +495,8 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, static u32 gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf) + struct xdr_buf *buf, unsigned int *slack, + unsigned int *align) { time64_t now; u8 *ptr; @@ -583,6 +590,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, /* Trim off the trailing "extra count" and checksum blob */ buf->len -= ec + GSS_KRB5_TOK_HDR_LEN + tailskip; + *align = XDR_QUADLEN(GSS_KRB5_TOK_HDR_LEN + headskip); + *slack = *align + XDR_QUADLEN(ec + GSS_KRB5_TOK_HDR_LEN + tailskip); return GSS_S_COMPLETE; } @@ -617,9 +626,11 @@ gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, case ENCTYPE_DES_CBC_RAW: case ENCTYPE_DES3_CBC_RAW: case ENCTYPE_ARCFOUR_HMAC: - return gss_unwrap_kerberos_v1(kctx, offset, len, buf); + return gss_unwrap_kerberos_v1(kctx, offset, len, buf, + &gctx->slack, &gctx->align); case ENCTYPE_AES128_CTS_HMAC_SHA1_96: case ENCTYPE_AES256_CTS_HMAC_SHA1_96: - return gss_unwrap_kerberos_v2(kctx, offset, len, buf); + return gss_unwrap_kerberos_v2(kctx, offset, len, buf, + &gctx->slack, &gctx->align); } } -- cgit v1.2.3 From 0a8e7b7d08466b5fc52f8e96070acc116d82a8bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 15 Apr 2020 17:36:22 -0400 Subject: SUNRPC: Revert 241b1f419f0e ("SUNRPC: Remove xdr_buf_trim()") I've noticed that when krb5i or krb5p security is in use, retransmitted requests are missing the server's duplicate reply cache. The computed checksum on the retransmitted request does not match the cached checksum, resulting in the server performing the retransmitted request again instead of returning the cached reply. The assumptions made when removing xdr_buf_trim() were not correct. In the send paths, the upper layer has already set the segment lengths correctly, and shorting the buffer's content is simply a matter of reducing buf->len. xdr_buf_trim() is the right answer in the receive/unwrap path on both the client and the server. The buffer segment lengths have to be shortened one-by-one. On the server side in particular, head.iov_len needs to be updated correctly to enable nfsd_cache_csum() to work correctly. The simple buf->len computation doesn't do that, and that results in checksumming stale data in the buffer. The problem isn't noticed until there's significant instability of the RPC transport. At that point, the reliability of retransmit detection on the server becomes crucial. Fixes: 241b1f419f0e ("SUNRPC: Remove xdr_buf_trim()") Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/auth_gss/gss_krb5_wrap.c | 7 +++---- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/xdr.c | 41 +++++++++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 01bb41908c93..22c207b2425f 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -184,6 +184,7 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); +extern void xdr_buf_trim(struct xdr_buf *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 4905652e7567..cf0fd170ac18 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -580,15 +580,14 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, */ movelen = min_t(unsigned int, buf->head[0].iov_len, len); movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip; - if (offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > - buf->head[0].iov_len) - return GSS_S_FAILURE; + BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > + buf->head[0].iov_len); memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; buf->len = len - GSS_KRB5_TOK_HDR_LEN + headskip; /* Trim off the trailing "extra count" and checksum blob */ - buf->len -= ec + GSS_KRB5_TOK_HDR_LEN + tailskip; + xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip); *align = XDR_QUADLEN(GSS_KRB5_TOK_HDR_LEN + headskip); *slack = *align + XDR_QUADLEN(ec + GSS_KRB5_TOK_HDR_LEN + tailskip); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d0a2f084e5a4..50d93c49ef1a 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -906,7 +906,7 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g if (svc_getnl(&buf->head[0]) != seq) goto out; /* trim off the mic and padding at the end before returning */ - buf->len -= 4 + round_up_to_quad(mic.len); + xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4); stat = 0; out: kfree(mic.data); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 15b58c5144f9..6f7d82fb1eb0 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1150,6 +1150,47 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, } EXPORT_SYMBOL_GPL(xdr_buf_subsegment); +/** + * xdr_buf_trim - lop at most "len" bytes off the end of "buf" + * @buf: buf to be trimmed + * @len: number of bytes to reduce "buf" by + * + * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note + * that it's possible that we'll trim less than that amount if the xdr_buf is + * too small, or if (for instance) it's all in the head and the parser has + * already read too far into it. + */ +void xdr_buf_trim(struct xdr_buf *buf, unsigned int len) +{ + size_t cur; + unsigned int trim = len; + + if (buf->tail[0].iov_len) { + cur = min_t(size_t, buf->tail[0].iov_len, trim); + buf->tail[0].iov_len -= cur; + trim -= cur; + if (!trim) + goto fix_len; + } + + if (buf->page_len) { + cur = min_t(unsigned int, buf->page_len, trim); + buf->page_len -= cur; + trim -= cur; + if (!trim) + goto fix_len; + } + + if (buf->head[0].iov_len) { + cur = min_t(size_t, buf->head[0].iov_len, trim); + buf->head[0].iov_len -= cur; + trim -= cur; + } +fix_len: + buf->len -= (len - trim); +} +EXPORT_SYMBOL_GPL(xdr_buf_trim); + static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) { unsigned int this_len; -- cgit v1.2.3 From 501be6c1c72417eab05e7413671a38ea991a8ebc Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 25 Mar 2020 21:16:03 +0100 Subject: drm/tegra: Fix SMMU support on Tegra124 and Tegra210 When testing whether or not to enable the use of the SMMU, consult the supported DMA mask rather than the actually configured DMA mask, since the latter might already have been restricted. Fixes: 2d9384ff9177 ("drm/tegra: Relax IOMMU usage criteria on old Tegra") Tested-by: Jon Hunter Signed-off-by: Thierry Reding --- drivers/gpu/drm/tegra/drm.c | 3 ++- drivers/gpu/host1x/dev.c | 13 +++++++++++++ include/linux/host1x.h | 3 +++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index bd268028fb3d..583cd6e0ae27 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -1039,6 +1039,7 @@ void tegra_drm_free(struct tegra_drm *tegra, size_t size, void *virt, static bool host1x_drm_wants_iommu(struct host1x_device *dev) { + struct host1x *host1x = dev_get_drvdata(dev->dev.parent); struct iommu_domain *domain; /* @@ -1076,7 +1077,7 @@ static bool host1x_drm_wants_iommu(struct host1x_device *dev) * sufficient and whether or not the host1x is attached to an IOMMU * doesn't matter. */ - if (!domain && dma_get_mask(dev->dev.parent) <= DMA_BIT_MASK(32)) + if (!domain && host1x_get_dma_mask(host1x) <= DMA_BIT_MASK(32)) return true; return domain != NULL; diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 388bcc2889aa..40a4b9f8b861 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -502,6 +502,19 @@ static void __exit tegra_host1x_exit(void) } module_exit(tegra_host1x_exit); +/** + * host1x_get_dma_mask() - query the supported DMA mask for host1x + * @host1x: host1x instance + * + * Note that this returns the supported DMA mask for host1x, which can be + * different from the applicable DMA mask under certain circumstances. + */ +u64 host1x_get_dma_mask(struct host1x *host1x) +{ + return host1x->info->dma_mask; +} +EXPORT_SYMBOL(host1x_get_dma_mask); + MODULE_AUTHOR("Thierry Reding "); MODULE_AUTHOR("Terje Bergstrom "); MODULE_DESCRIPTION("Host1x driver for Tegra products"); diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 62d216ff1097..c230b4e70d75 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -17,9 +17,12 @@ enum host1x_class { HOST1X_CLASS_GR3D = 0x60, }; +struct host1x; struct host1x_client; struct iommu_group; +u64 host1x_get_dma_mask(struct host1x *host1x); + /** * struct host1x_client_ops - host1x client operations * @init: host1x client initialization code -- cgit v1.2.3 From 2f5a55c52c00fcded796db5f961057ba3fec8910 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 2 May 2020 14:18:35 +0200 Subject: i2c: use my kernel.org address from now on The old email is still active, but for easier handling, I am going to use my kernel.org address from now on. Also, add a mailmap for the now defunct Pengutronix address. Signed-off-by: Wolfram Sang --- .mailmap | 2 ++ MAINTAINERS | 2 +- drivers/i2c/i2c-core-base.c | 2 +- drivers/i2c/i2c-core-of.c | 2 +- include/linux/i2c.h | 2 +- 5 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/.mailmap b/.mailmap index db3754a41018..4f906b4e9785 100644 --- a/.mailmap +++ b/.mailmap @@ -288,6 +288,8 @@ Vladimir Davydov Vladimir Davydov Takashi YOSHII Will Deacon +Wolfram Sang +Wolfram Sang Yakir Yang Yusuke Goda Gustavo Padovan diff --git a/MAINTAINERS b/MAINTAINERS index 2926327e4976..3a1f24367cc1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7949,7 +7949,7 @@ F: Documentation/i2c/busses/i2c-parport.rst F: drivers/i2c/busses/i2c-parport.c I2C SUBSYSTEM -M: Wolfram Sang +M: Wolfram Sang L: linux-i2c@vger.kernel.org S: Maintained W: https://i2c.wiki.kernel.org/ diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index b0de3078ab25..1f1442dfcad7 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -7,7 +7,7 @@ * Mux support by Rodolfo Giometti and * Michael Lawnick * - * Copyright (C) 2013-2017 Wolfram Sang + * Copyright (C) 2013-2017 Wolfram Sang */ #define pr_fmt(fmt) "i2c-core: " fmt diff --git a/drivers/i2c/i2c-core-of.c b/drivers/i2c/i2c-core-of.c index 6787c1f71483..3ed74aa4b44b 100644 --- a/drivers/i2c/i2c-core-of.c +++ b/drivers/i2c/i2c-core-of.c @@ -5,7 +5,7 @@ * Copyright (C) 2008 Jochen Friedrich * based on a previous patch from Jon Smirl * - * Copyright (C) 2013, 2018 Wolfram Sang + * Copyright (C) 2013, 2018 Wolfram Sang */ #include diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 45d36ba4826b..49d29054e657 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -2,7 +2,7 @@ /* * i2c.h - definitions for the Linux i2c bus interface * Copyright (C) 1995-2000 Simon G. Vogl - * Copyright (C) 2013-2019 Wolfram Sang + * Copyright (C) 2013-2019 Wolfram Sang * * With some changes from Kyösti Mälkki and * Frodo Looijaard -- cgit v1.2.3 From 81aabbb9fb7b4b1efd073b62f0505d3adad442f3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 4 May 2020 10:21:44 -0700 Subject: bpf, sockmap: bpf_tcp_ingress needs to subtract bytes from sg.size In bpf_tcp_ingress we used apply_bytes to subtract bytes from sg.size which is used to track total bytes in a message. But this is not correct because apply_bytes is itself modified in the main loop doing the mem_charge. Then at the end of this we have sg.size incorrectly set and out of sync with actual sk values. Then we can get a splat if we try to cork the data later and again try to redirect the msg to ingress. To fix instead of trying to track msg.size do the easy thing and include it as part of the sk_msg_xfer logic so that when the msg is moved the sg.size is always correct. To reproduce the below users will need ingress + cork and hit an error path that will then try to 'free' the skmsg. [ 173.699981] BUG: KASAN: null-ptr-deref in sk_msg_free_elem+0xdd/0x120 [ 173.699987] Read of size 8 at addr 0000000000000008 by task test_sockmap/5317 [ 173.700000] CPU: 2 PID: 5317 Comm: test_sockmap Tainted: G I 5.7.0-rc1+ #43 [ 173.700005] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019 [ 173.700009] Call Trace: [ 173.700021] dump_stack+0x8e/0xcb [ 173.700029] ? sk_msg_free_elem+0xdd/0x120 [ 173.700034] ? sk_msg_free_elem+0xdd/0x120 [ 173.700042] __kasan_report+0x102/0x15f [ 173.700052] ? sk_msg_free_elem+0xdd/0x120 [ 173.700060] kasan_report+0x32/0x50 [ 173.700070] sk_msg_free_elem+0xdd/0x120 [ 173.700080] __sk_msg_free+0x87/0x150 [ 173.700094] tcp_bpf_send_verdict+0x179/0x4f0 [ 173.700109] tcp_bpf_sendpage+0x3ce/0x5d0 Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/158861290407.14306.5327773422227552482.stgit@john-Precision-5820-Tower --- include/linux/skmsg.h | 1 + net/ipv4/tcp_bpf.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8a709f63c5e5..ad31c9fb7158 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -187,6 +187,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; + src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ff96466ea6da..629aaa9a1eb9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -125,7 +125,6 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - msg->sg.size -= apply_bytes; sk_psock_queue_msg(psock, tmp); sk_psock_data_ready(sk, psock); } else { -- cgit v1.2.3 From 54163a346d4a0a1b93f2ff6dc1f488419a605fa9 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 6 May 2020 08:17:53 -0500 Subject: KVM: Introduce kvm_make_all_cpus_request_except() This allows making request to all other vcpus except the one specified in the parameter. Signed-off-by: Suravee Suthikulpanit Message-Id: <1588771076-73790-2-git-send-email-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 3 +++ virt/kvm/kvm_main.c | 14 +++++++++++--- 4 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index bcefa9d4e57e..54d4b98b49e1 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1427,7 +1427,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, */ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, - vcpu_mask, &hv_vcpu->tlb_flush); + NULL, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: /* We always do full TLB flush, set rep_done = rep_cnt. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f780af601c5f..ba8edf3b89f6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8030,7 +8030,7 @@ void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, zalloc_cpumask_var(&cpus, GFP_ATOMIC); kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - vcpu_bitmap, cpus); + NULL, vcpu_bitmap, cpus); free_cpumask_var(cpus); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01276e3d01b9..131cc1527d68 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -813,8 +813,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except); bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 74bdb7bf3295..731c1e517716 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -259,6 +259,7 @@ static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) } bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp) { int i, cpu, me; @@ -268,7 +269,8 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, me = get_cpu(); kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) || + vcpu == except) continue; kvm_make_request(req, vcpu); @@ -288,19 +290,25 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, return called; } -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) +bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except) { cpumask_var_t cpus; bool called; zalloc_cpumask_var(&cpus, GFP_ATOMIC); - called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); + called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus); free_cpumask_var(cpus); return called; } +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) +{ + return kvm_make_all_cpus_request_except(kvm, req, NULL); +} + #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { -- cgit v1.2.3 From 2c864c78c2386ada7433268cdfa8cb77cfe31bf3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 11 May 2020 14:02:15 -0700 Subject: ptp: fix struct member comment for do_aux_work The do_aux_work callback had documentation in the structure comment which referred to it as "do_work". Signed-off-by: Jacob Keller Cc: Richard Cochran Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 121a7eda4593..c602670bbffb 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -105,10 +105,10 @@ struct ptp_system_timestamp { * parameter func: the desired function to use. * parameter chan: the function channel index to use. * - * @do_work: Request driver to perform auxiliary (periodic) operations - * Driver should return delay of the next auxiliary work scheduling - * time (>=0) or negative value in case further scheduling - * is not required. + * @do_aux_work: Request driver to perform auxiliary (periodic) operations + * Driver should return delay of the next auxiliary work + * scheduling time (>=0) or negative value in case further + * scheduling is not required. * * Drivers should embed their ptp_clock_info within a private * structure, obtaining a reference to it using container_of(). -- cgit v1.2.3 From 59566b0b622e3e6ea928c0b8cac8a5601b00b383 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 30 Apr 2020 20:21:47 -0400 Subject: x86/ftrace: Have ftrace trampolines turn read-only at the end of system boot up Booting one of my machines, it triggered the following crash: Kernel/User page tables isolation: enabled ftrace: allocating 36577 entries in 143 pages Starting tracer 'function' BUG: unable to handle page fault for address: ffffffffa000005c #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 2014067 P4D 2014067 PUD 2015063 PMD 7b253067 PTE 7b252061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 0 PID: 0 Comm: swapper Not tainted 5.4.0-test+ #24 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 RIP: 0010:text_poke_early+0x4a/0x58 Code: 34 24 48 89 54 24 08 e8 bf 72 0b 00 48 8b 34 24 48 8b 4c 24 08 84 c0 74 0b 48 89 df f3 a4 48 83 c4 10 5b c3 9c 58 fa 48 89 df a4 50 9d 48 83 c4 10 5b e9 d6 f9 ff ff 0 41 57 49 RSP: 0000:ffffffff82003d38 EFLAGS: 00010046 RAX: 0000000000000046 RBX: ffffffffa000005c RCX: 0000000000000005 RDX: 0000000000000005 RSI: ffffffff825b9a90 RDI: ffffffffa000005c RBP: ffffffffa000005c R08: 0000000000000000 R09: ffffffff8206e6e0 R10: ffff88807b01f4c0 R11: ffffffff8176c106 R12: ffffffff8206e6e0 R13: ffffffff824f2440 R14: 0000000000000000 R15: ffffffff8206eac0 FS: 0000000000000000(0000) GS:ffff88807d400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa000005c CR3: 0000000002012000 CR4: 00000000000006b0 Call Trace: text_poke_bp+0x27/0x64 ? mutex_lock+0x36/0x5d arch_ftrace_update_trampoline+0x287/0x2d5 ? ftrace_replace_code+0x14b/0x160 ? ftrace_update_ftrace_func+0x65/0x6c __register_ftrace_function+0x6d/0x81 ftrace_startup+0x23/0xc1 register_ftrace_function+0x20/0x37 func_set_flag+0x59/0x77 __set_tracer_option.isra.19+0x20/0x3e trace_set_options+0xd6/0x13e apply_trace_boot_options+0x44/0x6d register_tracer+0x19e/0x1ac early_trace_init+0x21b/0x2c9 start_kernel+0x241/0x518 ? load_ucode_intel_bsp+0x21/0x52 secondary_startup_64+0xa4/0xb0 I was able to trigger it on other machines, when I added to the kernel command line of both "ftrace=function" and "trace_options=func_stack_trace". The cause is the "ftrace=function" would register the function tracer and create a trampoline, and it will set it as executable and read-only. Then the "trace_options=func_stack_trace" would then update the same trampoline to include the stack tracer version of the function tracer. But since the trampoline already exists, it updates it with text_poke_bp(). The problem is that text_poke_bp() called while system_state == SYSTEM_BOOTING, it will simply do a memcpy() and not the page mapping, as it would think that the text is still read-write. But in this case it is not, and we take a fault and crash. Instead, lets keep the ftrace trampolines read-write during boot up, and then when the kernel executable text is set to read-only, the ftrace trampolines get set to read-only as well. Link: https://lkml.kernel.org/r/20200430202147.4dc6e2de@oasis.local.home Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: "H. Peter Anvin" Cc: stable@vger.kernel.org Fixes: 768ae4406a5c ("x86/ftrace: Use text_poke()") Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/ftrace.h | 6 ++++++ arch/x86/kernel/ftrace.c | 29 ++++++++++++++++++++++++++++- arch/x86/mm/init_64.c | 3 +++ include/linux/ftrace.h | 23 +++++++++++++++++++++++ kernel/trace/ftrace_internal.h | 22 ---------------------- 5 files changed, 60 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 85be2f506272..89af0d2c62aa 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -56,6 +56,12 @@ struct dyn_arch_ftrace { #ifndef __ASSEMBLY__ +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +extern void set_ftrace_ops_ro(void); +#else +static inline void set_ftrace_ops_ro(void) { } +#endif + #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 37a0aeaf89e7..b0e641793be4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -407,7 +407,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) set_vm_flush_reset_perms(trampoline); - set_memory_ro((unsigned long)trampoline, npages); + if (likely(system_state != SYSTEM_BOOTING)) + set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: @@ -415,6 +416,32 @@ fail: return 0; } +void set_ftrace_ops_ro(void) +{ + struct ftrace_ops *ops; + unsigned long start_offset; + unsigned long end_offset; + unsigned long npages; + unsigned long size; + + do_for_each_ftrace_op(ops, ftrace_ops_list) { + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + continue; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_epilogue; + } + size = end_offset - start_offset; + size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(size, PAGE_SIZE); + set_memory_ro((unsigned long)ops->trampoline, npages); + } while_for_each_ftrace_op(ops); +} + static unsigned long calc_trampoline_call_offset(bool save_regs) { unsigned long start_offset; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3b289c2f75cd..8b5f73f5e207 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -1291,6 +1292,8 @@ void mark_rodata_ro(void) all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + set_ftrace_ops_ro(); + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ab4bd15cbcdb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -210,6 +210,29 @@ struct ftrace_ops { #endif }; +extern struct ftrace_ops __rcu *ftrace_ops_list; +extern struct ftrace_ops ftrace_list_end; + +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw_check() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw_check() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw_check(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw_check((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + /* * Type of the current tracing. */ diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0456e0a3dab1..382775edf690 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -4,28 +4,6 @@ #ifdef CONFIG_FUNCTION_TRACER -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_check() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_check() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ -#define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_check(list); \ - do - -/* - * Optimized for just a single item in the list (as that is the normal case). - */ -#define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_check((op)->next)) && \ - unlikely((op) != &ftrace_list_end)) - -extern struct ftrace_ops __rcu *ftrace_ops_list; -extern struct ftrace_ops ftrace_list_end; extern struct mutex ftrace_lock; extern struct ftrace_ops global_ops; -- cgit v1.2.3 From 3d8c11efd528d56972d44ed0de51c4e11a9a4fa9 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 12 May 2020 13:55:02 +0900 Subject: efi: cper: Add support for printing Firmware Error Record Reference While debugging a boot failure, the following unknown error record was seen in the boot logs. <...> BERT: Error records from previous boot: [Hardware Error]: event severity: fatal [Hardware Error]: Error 0, type: fatal [Hardware Error]: section type: unknown, 81212a96-09ed-4996-9471-8d729c8e69ed [Hardware Error]: section length: 0x290 [Hardware Error]: 00000000: 00000001 00000000 00000000 00020002 ................ [Hardware Error]: 00000010: 00020002 0000001f 00000320 00000000 ........ ....... [Hardware Error]: 00000020: 00000000 00000000 00000000 00000000 ................ [Hardware Error]: 00000030: 00000000 00000000 00000000 00000000 ................ <...> On further investigation, it was found that the error record with UUID (81212a96-09ed-4996-9471-8d729c8e69ed) has been defined in the UEFI Specification at least since v2.4 and has recently had additional fields defined in v2.7 Section N.2.10 Firmware Error Record Reference. Add support for parsing and printing the defined fields to give users a chance to figure out what went wrong. Signed-off-by: Punit Agrawal Cc: Ard Biesheuvel Cc: "Rafael J. Wysocki" Cc: Borislav Petkov Cc: James Morse Cc: linux-acpi@vger.kernel.org Cc: linux-efi@vger.kernel.org Link: https://lore.kernel.org/r/20200512045502.3810339-1-punit1.agrawal@toshiba.co.jp Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/cper.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/cper.h | 9 +++++++ 2 files changed, 71 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 9d2512913d25..f564e15fbc7e 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -407,6 +407,58 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, } } +static const char * const fw_err_rec_type_strs[] = { + "IPF SAL Error Record", + "SOC Firmware Error Record Type1 (Legacy CrashLog Support)", + "SOC Firmware Error Record Type2", +}; + +static void cper_print_fw_err(const char *pfx, + struct acpi_hest_generic_data *gdata, + const struct cper_sec_fw_err_rec_ref *fw_err) +{ + void *buf = acpi_hest_get_payload(gdata); + u32 offset, length = gdata->error_data_length; + + printk("%s""Firmware Error Record Type: %s\n", pfx, + fw_err->record_type < ARRAY_SIZE(fw_err_rec_type_strs) ? + fw_err_rec_type_strs[fw_err->record_type] : "unknown"); + printk("%s""Revision: %d\n", pfx, fw_err->revision); + + /* Record Type based on UEFI 2.7 */ + if (fw_err->revision == 0) { + printk("%s""Record Identifier: %08llx\n", pfx, + fw_err->record_identifier); + } else if (fw_err->revision == 2) { + printk("%s""Record Identifier: %pUl\n", pfx, + &fw_err->record_identifier_guid); + } + + /* + * The FW error record may contain trailing data beyond the + * structure defined by the specification. As the fields + * defined (and hence the offset of any trailing data) vary + * with the revision, set the offset to account for this + * variation. + */ + if (fw_err->revision == 0) { + /* record_identifier_guid not defined */ + offset = offsetof(struct cper_sec_fw_err_rec_ref, + record_identifier_guid); + } else if (fw_err->revision == 1) { + /* record_identifier not defined */ + offset = offsetof(struct cper_sec_fw_err_rec_ref, + record_identifier); + } else { + offset = sizeof(*fw_err); + } + + buf += offset; + length -= offset; + + print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, buf, length, true); +} + static void cper_print_tstamp(const char *pfx, struct acpi_hest_generic_data_v300 *gdata) { @@ -494,6 +546,16 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata else goto err_section_too_small; #endif + } else if (guid_equal(sec_type, &CPER_SEC_FW_ERR_REC_REF)) { + struct cper_sec_fw_err_rec_ref *fw_err = acpi_hest_get_payload(gdata); + + printk("%ssection_type: Firmware Error Record Reference\n", + newpfx); + /* The minimal FW Error Record contains 16 bytes */ + if (gdata->error_data_length >= SZ_16) + cper_print_fw_err(newpfx, gdata, fw_err); + else + goto err_section_too_small; } else { const void *err = acpi_hest_get_payload(gdata); diff --git a/include/linux/cper.h b/include/linux/cper.h index 4f005d95ce88..8537e9282a65 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -521,6 +521,15 @@ struct cper_sec_pcie { u8 aer_info[96]; }; +/* Firmware Error Record Reference, UEFI v2.7 sec N.2.10 */ +struct cper_sec_fw_err_rec_ref { + u8 record_type; + u8 revision; + u8 reserved[6]; + u64 record_identifier; + guid_t record_identifier_guid; +}; + /* Reset to default packing */ #pragma pack() -- cgit v1.2.3 From 04fd61a4e01028210a91f0efc408c8bc61a3018c Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 13 May 2020 17:50:34 -0700 Subject: mm, memcg: fix inconsistent oom event behavior A recent commit 9852ae3fe529 ("mm, memcg: consider subtrees in memory.events") changed the behavior of memcg events, which will now consider subtrees in memory.events. But oom_kill event is a special one as it is used in both cgroup1 and cgroup2. In cgroup1, it is displayed in memory.oom_control. The file memory.oom_control is in both root memcg and non root memcg, that is different with memory.event as it only in non-root memcg. That commit is okay for cgroup2, but it is not okay for cgroup1 as it will cause inconsistent behavior between root memcg and non-root memcg. Here's an example on why this behavior is inconsistent in cgroup1. root memcg / memcg foo / memcg bar Suppose there's an oom_kill in memcg bar, then the oon_kill will be root memcg : memory.oom_control(oom_kill) 0 / memcg foo : memory.oom_control(oom_kill) 1 / memcg bar : memory.oom_control(oom_kill) 1 For the non-root memcg, its memory.oom_control(oom_kill) includes its descendants' oom_kill, but for root memcg, it doesn't include its descendants' oom_kill. That means, memory.oom_control(oom_kill) has different meanings in different memcgs. That is inconsistent. Then the user has to know whether the memcg is root or not. If we can't fully support it in cgroup1, for example by adding memory.events.local into cgroup1 as well, then let's don't touch its original behavior. Fixes: 9852ae3fe529 ("mm, memcg: consider subtrees in memory.events") Reported-by: Randy Dunlap Signed-off-by: Yafang Shao Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Chris Down Acked-by: Michal Hocko Cc: Link: http://lkml.kernel.org/r/20200502141055.7378-1-laoar.shao@gmail.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d275c72c4f8e..977edd3b7bd8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -783,6 +783,8 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && -- cgit v1.2.3 From 625236ba3832ae947cb3ebb7acc1f30788b274ef Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 12 May 2020 19:46:07 +0200 Subject: security: Fix the default value of secid_to_secctx hook security_secid_to_secctx is called by the bpf_lsm hook and a successful return value (i.e 0) implies that the parameter will be consumed by the LSM framework. The current behaviour return success when the pointer isn't initialized when CONFIG_BPF_LSM is enabled, with the default return from kernel/bpf/bpf_lsm.c. This is the internal error: [ 1229.341488][ T2659] usercopy: Kernel memory exposure attempt detected from null address (offset 0, size 280)! [ 1229.374977][ T2659] ------------[ cut here ]------------ [ 1229.376813][ T2659] kernel BUG at mm/usercopy.c:99! [ 1229.378398][ T2659] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 1229.380348][ T2659] Modules linked in: [ 1229.381654][ T2659] CPU: 0 PID: 2659 Comm: systemd-journal Tainted: G B W 5.7.0-rc5-next-20200511-00019-g864e0c6319b8-dirty #13 [ 1229.385429][ T2659] Hardware name: linux,dummy-virt (DT) [ 1229.387143][ T2659] pstate: 80400005 (Nzcv daif +PAN -UAO BTYPE=--) [ 1229.389165][ T2659] pc : usercopy_abort+0xc8/0xcc [ 1229.390705][ T2659] lr : usercopy_abort+0xc8/0xcc [ 1229.392225][ T2659] sp : ffff000064247450 [ 1229.393533][ T2659] x29: ffff000064247460 x28: 0000000000000000 [ 1229.395449][ T2659] x27: 0000000000000118 x26: 0000000000000000 [ 1229.397384][ T2659] x25: ffffa000127049e0 x24: ffffa000127049e0 [ 1229.399306][ T2659] x23: ffffa000127048e0 x22: ffffa000127048a0 [ 1229.401241][ T2659] x21: ffffa00012704b80 x20: ffffa000127049e0 [ 1229.403163][ T2659] x19: ffffa00012704820 x18: 0000000000000000 [ 1229.405094][ T2659] x17: 0000000000000000 x16: 0000000000000000 [ 1229.407008][ T2659] x15: 0000000000000000 x14: 003d090000000000 [ 1229.408942][ T2659] x13: ffff80000d5b25b2 x12: 1fffe0000d5b25b1 [ 1229.410859][ T2659] x11: 1fffe0000d5b25b1 x10: ffff80000d5b25b1 [ 1229.412791][ T2659] x9 : ffffa0001034bee0 x8 : ffff00006ad92d8f [ 1229.414707][ T2659] x7 : 0000000000000000 x6 : ffffa00015eacb20 [ 1229.416642][ T2659] x5 : ffff0000693c8040 x4 : 0000000000000000 [ 1229.418558][ T2659] x3 : ffffa0001034befc x2 : d57a7483a01c6300 [ 1229.420610][ T2659] x1 : 0000000000000000 x0 : 0000000000000059 [ 1229.422526][ T2659] Call trace: [ 1229.423631][ T2659] usercopy_abort+0xc8/0xcc [ 1229.425091][ T2659] __check_object_size+0xdc/0x7d4 [ 1229.426729][ T2659] put_cmsg+0xa30/0xa90 [ 1229.428132][ T2659] unix_dgram_recvmsg+0x80c/0x930 [ 1229.429731][ T2659] sock_recvmsg+0x9c/0xc0 [ 1229.431123][ T2659] ____sys_recvmsg+0x1cc/0x5f8 [ 1229.432663][ T2659] ___sys_recvmsg+0x100/0x160 [ 1229.434151][ T2659] __sys_recvmsg+0x110/0x1a8 [ 1229.435623][ T2659] __arm64_sys_recvmsg+0x58/0x70 [ 1229.437218][ T2659] el0_svc_common.constprop.1+0x29c/0x340 [ 1229.438994][ T2659] do_el0_svc+0xe8/0x108 [ 1229.440587][ T2659] el0_svc+0x74/0x88 [ 1229.441917][ T2659] el0_sync_handler+0xe4/0x8b4 [ 1229.443464][ T2659] el0_sync+0x17c/0x180 [ 1229.444920][ T2659] Code: aa1703e2 aa1603e1 910a8260 97ecc860 (d4210000) [ 1229.447070][ T2659] ---[ end trace 400497d91baeaf51 ]--- [ 1229.448791][ T2659] Kernel panic - not syncing: Fatal exception [ 1229.450692][ T2659] Kernel Offset: disabled [ 1229.452061][ T2659] CPU features: 0x240002,20002004 [ 1229.453647][ T2659] Memory Limit: none [ 1229.455015][ T2659] ---[ end Kernel panic - not syncing: Fatal exception ]--- Rework the so the default return value is -EOPNOTSUPP. There are likely other callbacks such as security_inode_getsecctx() that may have the same problem, and that someone that understand the code better needs to audit them. Thank you Arnd for helping me figure out what went wrong. Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks") Signed-off-by: Anders Roxell Signed-off-by: Alexei Starovoitov Acked-by: James Morris Cc: Arnd Bergmann Link: https://lore.kernel.org/bpf/20200512174607.9630-1-anders.roxell@linaro.org --- include/linux/lsm_hook_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 9cd4455528e5..21f4fff9e4cd 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -243,7 +243,7 @@ LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) -LSM_HOOK(int, 0, secid_to_secctx, u32 secid, char **secdata, +LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) -- cgit v1.2.3 From cc8a677a76f419016b5e231207d09b073f9b1d3f Mon Sep 17 00:00:00 2001 From: Kevin Lo Date: Thu, 14 May 2020 08:57:33 +0800 Subject: net: phy: broadcom: fix BCM54XX_SHD_SCR3_TRDDAPD value for BCM54810 Set the correct bit when checking for PHY_BRCM_DIS_TXCRXC_NOENRGY on the BCM54810 PHY. Fixes: 0ececcfc9267 ("net: phy: broadcom: Allow BCM54810 to use bcm54xx_adjust_rxrefclk()") Signed-off-by: Kevin Lo Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 8 ++++++-- include/linux/brcmphy.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index ae4873f2f86e..d14d91b759b7 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -225,8 +225,12 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) else val |= BCM54XX_SHD_SCR3_DLLAPD_DIS; - if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) - val |= BCM54XX_SHD_SCR3_TRDDAPD; + if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) { + if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) + val |= BCM54810_SHD_SCR3_TRDDAPD; + else + val |= BCM54XX_SHD_SCR3_TRDDAPD; + } if (orig != val) bcm_phy_write_shadow(phydev, BCM54XX_SHD_SCR3, val); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6462c5447872..f4b77018c625 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -245,6 +245,7 @@ #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN (1 << 0) #define BCM54810_SHD_CLK_CTL 0x3 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN (1 << 9) +#define BCM54810_SHD_SCR3_TRDDAPD 0x0100 /* BCM54612E Registers */ #define BCM54612E_EXP_SPARE0 (MII_BCM54XX_EXP_SEL_ETC + 0x34) -- cgit v1.2.3 From 8695e0b1b964f6d7caee667f14dceb7e8a4a3b3c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:53:29 -0500 Subject: i2c: mux: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Peter Rosin Signed-off-by: Wolfram Sang --- include/linux/i2c-mux.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h index c5a977320f82..98ef73b7c8fd 100644 --- a/include/linux/i2c-mux.h +++ b/include/linux/i2c-mux.h @@ -29,7 +29,7 @@ struct i2c_mux_core { int num_adapters; int max_adapters; - struct i2c_adapter *adapter[0]; + struct i2c_adapter *adapter[]; }; struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, -- cgit v1.2.3 From a9a3ed1eff3601b63aea4fb462d8b3b92c7c1e7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 22 Apr 2020 18:11:30 +0200 Subject: x86: Fix early boot crash on gcc-10, third try MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... or the odyssey of trying to disable the stack protector for the function which generates the stack canary value. The whole story started with Sergei reporting a boot crash with a kernel built with gcc-10: Kernel panic — not syncing: stack-protector: Kernel stack is corrupted in: start_secondary CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.6.0-rc5—00235—gfffb08b37df9 #139 Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./H77M—D3H, BIOS F12 11/14/2013 Call Trace: dump_stack panic ? start_secondary __stack_chk_fail start_secondary secondary_startup_64 -—-[ end Kernel panic — not syncing: stack—protector: Kernel stack is corrupted in: start_secondary This happens because gcc-10 tail-call optimizes the last function call in start_secondary() - cpu_startup_entry() - and thus emits a stack canary check which fails because the canary value changes after the boot_init_stack_canary() call. To fix that, the initial attempt was to mark the one function which generates the stack canary with: __attribute__((optimize("-fno-stack-protector"))) ... start_secondary(void *unused) however, using the optimize attribute doesn't work cumulatively as the attribute does not add to but rather replaces previously supplied optimization options - roughly all -fxxx options. The key one among them being -fno-omit-frame-pointer and thus leading to not present frame pointer - frame pointer which the kernel needs. The next attempt to prevent compilers from tail-call optimizing the last function call cpu_startup_entry(), shy of carving out start_secondary() into a separate compilation unit and building it with -fno-stack-protector, was to add an empty asm(""). This current solution was short and sweet, and reportedly, is supported by both compilers but we didn't get very far this time: future (LTO?) optimization passes could potentially eliminate this, which leads us to the third attempt: having an actual memory barrier there which the compiler cannot ignore or move around etc. That should hold for a long time, but hey we said that about the other two solutions too so... Reported-by: Sergei Trofimovich Signed-off-by: Borislav Petkov Tested-by: Kalle Valo Cc: Link: https://lkml.kernel.org/r/20200314164451.346497-1-slyfox@gentoo.org --- arch/x86/include/asm/stackprotector.h | 7 ++++++- arch/x86/kernel/smpboot.c | 8 ++++++++ arch/x86/xen/smp_pv.c | 1 + include/linux/compiler.h | 6 ++++++ init/main.c | 2 ++ 5 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 91e29b6a86a5..9804a7957f4e 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -55,8 +55,13 @@ /* * Initialize the stackprotector canary value. * - * NOTE: this must only be called from functions that never return, + * NOTE: this must only be called from functions that never return * and it must always be inlined. + * + * In addition, it should be called from a compilation unit for which + * stack protector is disabled. Alternatively, the caller should not end + * with a function call which gets tail-call optimized as that would + * lead to checking a modified canary value. */ static __always_inline void boot_init_stack_canary(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c89e4d9ad28..2f24c334a938 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -266,6 +266,14 @@ static void notrace start_secondary(void *unused) wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + + /* + * Prevent tail call to cpu_startup_entry() because the stack protector + * guard has been changed a couple of function calls up, in + * boot_init_stack_canary() and must not be checked before tail calling + * another function. + */ + prevent_tail_call_optimization(); } /** diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 8fb8a50a28b4..f2adb63b2d7c 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -93,6 +93,7 @@ asmlinkage __visible void cpu_bringup_and_idle(void) cpu_bringup(); boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + prevent_tail_call_optimization(); } void xen_smp_intr_free_pv(unsigned int cpu) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 034b0a644efc..448c91bf543b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -356,4 +356,10 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * This is needed in functions which generate the stack canary, see + * arch/x86/kernel/smpboot.c::start_secondary() for an example. + */ +#define prevent_tail_call_optimization() mb() + #endif /* __LINUX_COMPILER_H */ diff --git a/init/main.c b/init/main.c index 1a5da2c2660c..ad3812b5ae65 100644 --- a/init/main.c +++ b/init/main.c @@ -1036,6 +1036,8 @@ asmlinkage __visible void __init start_kernel(void) /* Do the rest non-__init'ed, we're now alive */ arch_call_rest_init(); + + prevent_tail_call_optimization(); } /* Call all constructor functions linked into the kernel. */ -- cgit v1.2.3 From e8da08a088236aff4b51d4ec97c750051f9fe417 Mon Sep 17 00:00:00 2001 From: Benjamin Thiel Date: Sat, 16 May 2020 15:26:47 +0200 Subject: efi: Pull up arch-specific prototype efi_systab_show_arch() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull up arch-specific prototype efi_systab_show_arch() in order to fix a -Wmissing-prototypes warning: arch/x86/platform/efi/efi.c:957:7: warning: no previous prototype for ‘efi_systab_show_arch’ [-Wmissing-prototypes] char *efi_systab_show_arch(char *str) Signed-off-by: Benjamin Thiel Link: https://lore.kernel.org/r/20200516132647.14568-1-b.thiel@posteo.de Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 5 +---- include/linux/efi.h | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 911a2bd0f6b7..4e3055238f31 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -130,11 +130,8 @@ static ssize_t systab_show(struct kobject *kobj, if (efi.smbios != EFI_INVALID_TABLE_ADDR) str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios); - if (IS_ENABLED(CONFIG_IA64) || IS_ENABLED(CONFIG_X86)) { - extern char *efi_systab_show_arch(char *str); - + if (IS_ENABLED(CONFIG_IA64) || IS_ENABLED(CONFIG_X86)) str = efi_systab_show_arch(str); - } return str - buf; } diff --git a/include/linux/efi.h b/include/linux/efi.h index 251f1f783cdf..9430d01c0c3d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1245,4 +1245,6 @@ struct linux_efi_memreserve { void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size); +char *efi_systab_show_arch(char *str); + #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From 17d00e839d3b592da9659c1977d45f85b77f986a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 27 Dec 2019 07:01:53 +0200 Subject: net/mlx5: Add command entry handling completion When FW response to commands is very slow and all command entries in use are waiting for completion we can have a race where commands can get timeout before they get out of the queue and handled. Timeout completion on uninitialized command will cause releasing command's buffers before accessing it for initialization and then we will get NULL pointer exception while trying access it. It may also cause releasing buffers of another command since we may have timeout completion before even allocating entry index for this command. Add entry handling completion to avoid this race. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 14 ++++++++++++++ include/linux/mlx5/driver.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index cede5bdfd598..d695b75bc0af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -861,6 +861,7 @@ static void cmd_work_handler(struct work_struct *work) int alloc_ret; int cmd_mode; + complete(&ent->handling); sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); if (!ent->page_queue) { @@ -978,6 +979,11 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) struct mlx5_cmd *cmd = &dev->cmd; int err; + if (!wait_for_completion_timeout(&ent->handling, timeout) && + cancel_work_sync(&ent->work)) { + ent->ret = -ECANCELED; + goto out_err; + } if (cmd->mode == CMD_MODE_POLLING || ent->polling) { wait_for_completion(&ent->done); } else if (!wait_for_completion_timeout(&ent->done, timeout)) { @@ -985,12 +991,17 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } +out_err: err = ent->ret; if (err == -ETIMEDOUT) { mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); + } else if (err == -ECANCELED) { + mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n", + mlx5_command_str(msg_to_opcode(ent->in)), + msg_to_opcode(ent->in)); } mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n", err, deliv_status_to_str(ent->status), ent->status); @@ -1026,6 +1037,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, ent->token = token; ent->polling = force_polling; + init_completion(&ent->handling); if (!callback) init_completion(&ent->done); @@ -1045,6 +1057,8 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, err = wait_func(dev, ent); if (err == -ETIMEDOUT) goto out; + if (err == -ECANCELED) + goto out_free; ds = ent->ts2 - ent->ts1; op = MLX5_GET(mbox_in, in->first.data, opcode); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6f8f79ef829b..9b1f29f26c27 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -743,6 +743,7 @@ struct mlx5_cmd_work_ent { struct delayed_work cb_timeout_work; void *context; int idx; + struct completion handling; struct completion done; struct mlx5_cmd *cmd; struct work_struct work; -- cgit v1.2.3 From d43b7007dbd1195a5b6b83213e49b1516aaf6f5e Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 18 Mar 2020 21:44:32 +0200 Subject: net/mlx5: Fix a race when moving command interface to events mode After driver creates (via FW command) an EQ for commands, the driver will be informed on new commands completion by EQE. However, due to a race in driver's internal command mode metadata update, some new commands will still be miss-handled by driver as if we are in polling mode. Such commands can get two non forced completion, leading to already freed command entry access. CREATE_EQ command, that maps EQ to the command queue must be posted to the command queue while it is empty and no other command should be posted. Add SW mechanism that once the CREATE_EQ command is about to be executed, all other commands will return error without being sent to the FW. Allow sending other commands only after successfully changing the driver's internal command mode metadata. We can safely return error to all other commands while creating the command EQ, as all other commands might be sent from the user/application during driver load. Application can rerun them later after driver's load was finished. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Eran Ben Elisha Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 35 ++++++++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 3 +++ include/linux/mlx5/driver.h | 6 +++++ 3 files changed, 40 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index d695b75bc0af..2f3cafdc3b1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -848,6 +848,14 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); +static bool opcode_allowed(struct mlx5_cmd *cmd, u16 opcode) +{ + if (cmd->allowed_opcode == CMD_ALLOWED_OPCODE_ALL) + return true; + + return cmd->allowed_opcode == opcode; +} + static void cmd_work_handler(struct work_struct *work) { struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); @@ -914,7 +922,8 @@ static void cmd_work_handler(struct work_struct *work) /* Skip sending command to fw if internal error */ if (pci_channel_offline(dev->pdev) || - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + !opcode_allowed(&dev->cmd, ent->op)) { u8 status = 0; u32 drv_synd; @@ -1405,6 +1414,22 @@ static void create_debugfs_files(struct mlx5_core_dev *dev) mlx5_cmdif_debugfs_init(dev); } +void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) + down(&cmd->sem); + down(&cmd->pages_sem); + + cmd->allowed_opcode = opcode; + + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + static void mlx5_cmd_change_mod(struct mlx5_core_dev *dev, int mode) { struct mlx5_cmd *cmd = &dev->cmd; @@ -1681,12 +1706,13 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int err; u8 status = 0; u32 drv_synd; + u16 opcode; u8 token; + opcode = MLX5_GET(mbox_in, in, opcode); if (pci_channel_offline(dev->pdev) || - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - u16 opcode = MLX5_GET(mbox_in, in, opcode); - + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + !opcode_allowed(&dev->cmd, opcode)) { err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); MLX5_SET(mbox_out, out, status, status); MLX5_SET(mbox_out, out, syndrome, drv_synd); @@ -1988,6 +2014,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma)); cmd->mode = CMD_MODE_POLLING; + cmd->allowed_opcode = CMD_ALLOWED_OPCODE_ALL; create_msg_cache(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index cccea3a8eddd..ce6c621af043 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -611,11 +611,13 @@ static int create_async_eqs(struct mlx5_core_dev *dev) .nent = MLX5_NUM_CMD_EQE, .mask[0] = 1ull << MLX5_EVENT_TYPE_CMD, }; + mlx5_cmd_allowed_opcode(dev, MLX5_CMD_OP_CREATE_EQ); err = setup_async_eq(dev, &table->cmd_eq, ¶m, "cmd"); if (err) goto err1; mlx5_cmd_use_events(dev); + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); param = (struct mlx5_eq_param) { .irq_index = 0, @@ -645,6 +647,7 @@ err2: mlx5_cmd_use_polling(dev); cleanup_async_eq(dev, &table->cmd_eq, "cmd"); err1: + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); return err; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9b1f29f26c27..c03778c75dfa 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -284,6 +284,7 @@ struct mlx5_cmd { struct semaphore sem; struct semaphore pages_sem; int mode; + u16 allowed_opcode; struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS]; struct dma_pool *pool; struct mlx5_cmd_debug dbg; @@ -875,10 +876,15 @@ mlx5_frag_buf_get_idx_last_contig_stride(struct mlx5_frag_buf_ctrl *fbc, u32 ix) return min_t(u32, last_frag_stride_idx - fbc->strides_offset, fbc->sz_m1); } +enum { + CMD_ALLOWED_OPCODE_ALL, +}; + int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); +void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode); struct mlx5_async_ctx { struct mlx5_core_dev *dev; -- cgit v1.2.3 From f7936ddd35d8b849daf0372770c7c9dbe7910fca Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 19 Mar 2020 21:43:13 +0200 Subject: net/mlx5: Avoid processing commands before cmdif is ready When driver is reloading during recovery flow, it can't get new commands till command interface is up again. Otherwise we may get to null pointer trying to access non initialized command structures. Add cmdif state to avoid processing commands while cmdif is not ready. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Eran Ben Elisha Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 10 ++++++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++++ include/linux/mlx5/driver.h | 9 +++++++++ 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 2f3cafdc3b1f..7a77fe40af3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -923,6 +923,7 @@ static void cmd_work_handler(struct work_struct *work) /* Skip sending command to fw if internal error */ if (pci_channel_offline(dev->pdev) || dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + cmd->state != MLX5_CMDIF_STATE_UP || !opcode_allowed(&dev->cmd, ent->op)) { u8 status = 0; u32 drv_synd; @@ -1712,6 +1713,7 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, opcode = MLX5_GET(mbox_in, in, opcode); if (pci_channel_offline(dev->pdev) || dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + dev->cmd.state != MLX5_CMDIF_STATE_UP || !opcode_allowed(&dev->cmd, opcode)) { err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); MLX5_SET(mbox_out, out, status, status); @@ -1977,6 +1979,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) goto err_free_page; } + cmd->state = MLX5_CMDIF_STATE_DOWN; cmd->checksum_disabled = 1; cmd->max_reg_cmds = (1 << cmd->log_sz) - 1; cmd->bitmask = (1UL << cmd->max_reg_cmds) - 1; @@ -2054,3 +2057,10 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) dma_pool_destroy(cmd->pool); } EXPORT_SYMBOL(mlx5_cmd_cleanup); + +void mlx5_cmd_set_state(struct mlx5_core_dev *dev, + enum mlx5_cmdif_state cmdif_state) +{ + dev->cmd.state = cmdif_state; +} +EXPORT_SYMBOL(mlx5_cmd_set_state); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 7af4210c1b96..a61e473db7e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -965,6 +965,8 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot) goto err_cmd_cleanup; } + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP); + err = mlx5_core_enable_hca(dev, 0); if (err) { mlx5_core_err(dev, "enable hca failed\n"); @@ -1026,6 +1028,7 @@ reclaim_boot_pages: err_disable_hca: mlx5_core_disable_hca(dev, 0); err_cmd_cleanup: + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_cleanup(dev); return err; @@ -1043,6 +1046,7 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot) } mlx5_reclaim_startup_pages(dev); mlx5_core_disable_hca(dev, 0); + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_cleanup(dev); return 0; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c03778c75dfa..8397b6558dc7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -213,6 +213,12 @@ enum mlx5_port_status { MLX5_PORT_DOWN = 2, }; +enum mlx5_cmdif_state { + MLX5_CMDIF_STATE_UNINITIALIZED, + MLX5_CMDIF_STATE_UP, + MLX5_CMDIF_STATE_DOWN, +}; + struct mlx5_cmd_first { __be32 data[4]; }; @@ -258,6 +264,7 @@ struct mlx5_cmd_stats { struct mlx5_cmd { struct mlx5_nb nb; + enum mlx5_cmdif_state state; void *cmd_alloc_buf; dma_addr_t alloc_dma; int alloc_size; @@ -882,6 +889,8 @@ enum { int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); +void mlx5_cmd_set_state(struct mlx5_core_dev *dev, + enum mlx5_cmdif_state cmdif_state); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode); -- cgit v1.2.3