From 6648e613226e18897231ab5e42ffc29e63fa3365 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 4 Apr 2024 10:10:01 +0800 Subject: bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue Fix NULL pointer data-races in sk_psock_skb_ingress_enqueue() which syzbot reported [1]. [1] BUG: KCSAN: data-race in sk_psock_drop / sk_psock_skb_ingress_enqueue write to 0xffff88814b3278b8 of 8 bytes by task 10724 on cpu 1: sk_psock_stop_verdict net/core/skmsg.c:1257 [inline] sk_psock_drop+0x13e/0x1f0 net/core/skmsg.c:843 sk_psock_put include/linux/skmsg.h:459 [inline] sock_map_close+0x1a7/0x260 net/core/sock_map.c:1648 unix_release+0x4b/0x80 net/unix/af_unix.c:1048 __sock_release net/socket.c:659 [inline] sock_close+0x68/0x150 net/socket.c:1421 __fput+0x2c1/0x660 fs/file_table.c:422 __fput_sync+0x44/0x60 fs/file_table.c:507 __do_sys_close fs/open.c:1556 [inline] __se_sys_close+0x101/0x1b0 fs/open.c:1541 __x64_sys_close+0x1f/0x30 fs/open.c:1541 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 read to 0xffff88814b3278b8 of 8 bytes by task 10713 on cpu 0: sk_psock_data_ready include/linux/skmsg.h:464 [inline] sk_psock_skb_ingress_enqueue+0x32d/0x390 net/core/skmsg.c:555 sk_psock_skb_ingress_self+0x185/0x1e0 net/core/skmsg.c:606 sk_psock_verdict_apply net/core/skmsg.c:1008 [inline] sk_psock_verdict_recv+0x3e4/0x4a0 net/core/skmsg.c:1202 unix_read_skb net/unix/af_unix.c:2546 [inline] unix_stream_read_skb+0x9e/0xf0 net/unix/af_unix.c:2682 sk_psock_verdict_data_ready+0x77/0x220 net/core/skmsg.c:1223 unix_stream_sendmsg+0x527/0x860 net/unix/af_unix.c:2339 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x140/0x180 net/socket.c:745 ____sys_sendmsg+0x312/0x410 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x1e9/0x280 net/socket.c:2667 __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x46/0x50 net/socket.c:2674 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 value changed: 0xffffffff83d7feb0 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 10713 Comm: syz-executor.4 Tainted: G W 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Prior to this, commit 4cd12c6065df ("bpf, sockmap: Fix NULL pointer dereference in sk_psock_verdict_data_ready()") fixed one NULL pointer similarly due to no protection of saved_data_ready. Here is another different caller causing the same issue because of the same reason. So we should protect it with sk_callback_lock read lock because the writer side in the sk_psock_drop() uses "write_lock_bh(&sk->sk_callback_lock);". To avoid errors that could happen in future, I move those two pairs of lock into the sk_psock_data_ready(), which is suggested by John Fastabend. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: syzbot+aa8c8ec2538929f18f2d@syzkaller.appspotmail.com Signed-off-by: Jason Xing Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=aa8c8ec2538929f18f2d Link: https://lore.kernel.org/all/20240329134037.92124-1-kerneljasonxing@gmail.com Link: https://lore.kernel.org/bpf/20240404021001.94815-1-kerneljasonxing@gmail.com --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd2799..a509caf823d6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -461,10 +461,12 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { + read_lock_bh(&sk->sk_callback_lock); if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); + read_unlock_bh(&sk->sk_callback_lock); } static inline void psock_set_prog(struct bpf_prog **pprog, -- cgit v1.2.3 From 70ee853eec5693fefd8348a2b049d9cb83362e58 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 8 Apr 2024 11:18:00 +0100 Subject: regmap: Add regmap_read_bypassed() Add a regmap_read_bypassed() to allow reads from the hardware registers while the regmap is in cache-only mode. A typical use for this is to keep the cache in cache-only mode until the hardware has reached a valid state, but one or more status registers must be polled to determine when this state is reached. For example, firmware download on the cs35l56 can take several seconds if there are multiple amps sharing limited bus bandwidth. This is too long to block in probe() so it is done as a background task. The device must be soft-reset to reboot the firmware and during this time the registers are not accessible, so the cache should be in cache-only. But the driver must poll a register to detect when reboot has completed. Signed-off-by: Richard Fitzgerald Fixes: 8a731fd37f8b ("ASoC: cs35l56: Move utility functions to shared file") Link: https://msgid.link/r/20240408101803.43183-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/regmap.h | 8 ++++++++ 2 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 5cb425f6f02d..0a34dd3c4f38 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2838,6 +2838,43 @@ int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val) } EXPORT_SYMBOL_GPL(regmap_read); +/** + * regmap_read_bypassed() - Read a value from a single register direct + * from the device, bypassing the cache + * + * @map: Register map to read from + * @reg: Register to be read from + * @val: Pointer to store read value + * + * A value of zero will be returned on success, a negative errno will + * be returned in error cases. + */ +int regmap_read_bypassed(struct regmap *map, unsigned int reg, unsigned int *val) +{ + int ret; + bool bypass, cache_only; + + if (!IS_ALIGNED(reg, map->reg_stride)) + return -EINVAL; + + map->lock(map->lock_arg); + + bypass = map->cache_bypass; + cache_only = map->cache_only; + map->cache_bypass = true; + map->cache_only = false; + + ret = _regmap_read(map, reg, val); + + map->cache_bypass = bypass; + map->cache_only = cache_only; + + map->unlock(map->lock_arg); + + return ret; +} +EXPORT_SYMBOL_GPL(regmap_read_bypassed); + /** * regmap_raw_read() - Read raw data from the device * diff --git a/include/linux/regmap.h b/include/linux/regmap.h index b743241cfb7c..d470303b1bbb 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1230,6 +1230,7 @@ int regmap_multi_reg_write_bypassed(struct regmap *map, int regmap_raw_write_async(struct regmap *map, unsigned int reg, const void *val, size_t val_len); int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val); +int regmap_read_bypassed(struct regmap *map, unsigned int reg, unsigned int *val); int regmap_raw_read(struct regmap *map, unsigned int reg, void *val, size_t val_len); int regmap_noinc_read(struct regmap *map, unsigned int reg, @@ -1739,6 +1740,13 @@ static inline int regmap_read(struct regmap *map, unsigned int reg, return -EINVAL; } +static inline int regmap_read_bypassed(struct regmap *map, unsigned int reg, + unsigned int *val) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regmap_raw_read(struct regmap *map, unsigned int reg, void *val, size_t val_len) { -- cgit v1.2.3 From ed09f81eeaa8f9265e1787282cb283f10285c259 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sat, 6 Apr 2024 15:01:09 +0200 Subject: firmware: qcom: uefisecapp: Fix memory related IO errors and crashes It turns out that while the QSEECOM APP_SEND command has specific fields for request and response buffers, uefisecapp expects them both to be in a single memory region. Failure to adhere to this has (so far) resulted in either no response being written to the response buffer (causing an EIO to be emitted down the line), the SCM call to fail with EINVAL (i.e., directly from TZ/firmware), or the device to be hard-reset. While this issue can be triggered deterministically, in the current form it seems to happen rather sporadically (which is why it has gone unnoticed during earlier testing). This is likely due to the two kzalloc() calls (for request and response) being directly after each other. Which means that those likely return consecutive regions most of the time, especially when not much else is going on in the system. Fix this by allocating a single memory region for both request and response buffers, properly aligning both structs inside it. This unfortunately also means that the qcom_scm_qseecom_app_send() interface needs to be restructured, as it should no longer map the DMA regions separately. Therefore, move the responsibility of DMA allocation (or mapping) to the caller. Fixes: 759e7a2b62eb ("firmware: Add support for Qualcomm UEFI Secure Application") Cc: stable@vger.kernel.org # 6.7 Tested-by: Johan Hovold Reviewed-by: Johan Hovold Signed-off-by: Maximilian Luz Tested-by: Konrad Dybcio # X13s Link: https://lore.kernel.org/r/20240406130125.1047436-1-luzmaximilian@gmail.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_qseecom_uefisecapp.c | 137 ++++++++++++++++-------- drivers/firmware/qcom/qcom_scm.c | 37 ++----- include/linux/firmware/qcom/qcom_qseecom.h | 55 +++++++++- include/linux/firmware/qcom/qcom_scm.h | 10 +- 4 files changed, 153 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c b/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c index 32188f098ef3..bc550ad0dbe0 100644 --- a/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c +++ b/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c @@ -221,6 +221,19 @@ struct qsee_rsp_uefi_query_variable_info { * alignment of 8 bytes (64 bits) for GUIDs. Our definition of efi_guid_t, * however, has an alignment of 4 byte (32 bits). So far, this seems to work * fine here. See also the comment on the typedef of efi_guid_t. + * + * Note: It looks like uefisecapp is quite picky about how the memory passed to + * it is structured and aligned. In particular the request/response setup used + * for QSEE_CMD_UEFI_GET_VARIABLE. While qcom_qseecom_app_send(), in theory, + * accepts separate buffers/addresses for the request and response parts, in + * practice, however, it seems to expect them to be both part of a larger + * contiguous block. We initially allocated separate buffers for the request + * and response but this caused the QSEE_CMD_UEFI_GET_VARIABLE command to + * either not write any response to the response buffer or outright crash the + * device. Therefore, we now allocate a single contiguous block of DMA memory + * for both and properly align the data using the macros below. In particular, + * request and response structs are aligned at 8 byte (via __reqdata_offs()), + * following the driver that this has been reverse-engineered from. */ #define qcuefi_buf_align_fields(fields...) \ ({ \ @@ -244,6 +257,12 @@ struct qsee_rsp_uefi_query_variable_info { #define __array_offs(type, count, offset) \ __field_impl(sizeof(type) * (count), __alignof__(type), offset) +#define __array_offs_aligned(type, count, align, offset) \ + __field_impl(sizeof(type) * (count), align, offset) + +#define __reqdata_offs(size, offset) \ + __array_offs_aligned(u8, size, 8, offset) + #define __array(type, count) __array_offs(type, count, NULL) #define __field_offs(type, offset) __array_offs(type, 1, offset) #define __field(type) __array_offs(type, 1, NULL) @@ -277,10 +296,15 @@ static efi_status_t qsee_uefi_get_variable(struct qcuefi_client *qcuefi, const e unsigned long buffer_size = *data_size; efi_status_t efi_status = EFI_SUCCESS; unsigned long name_length; + dma_addr_t cmd_buf_dma; + size_t cmd_buf_size; + void *cmd_buf; size_t guid_offs; size_t name_offs; size_t req_size; size_t rsp_size; + size_t req_offs; + size_t rsp_offs; ssize_t status; if (!name || !guid) @@ -304,17 +328,19 @@ static efi_status_t qsee_uefi_get_variable(struct qcuefi_client *qcuefi, const e __array(u8, buffer_size) ); - req_data = kzalloc(req_size, GFP_KERNEL); - if (!req_data) { + cmd_buf_size = qcuefi_buf_align_fields( + __reqdata_offs(req_size, &req_offs) + __reqdata_offs(rsp_size, &rsp_offs) + ); + + cmd_buf = qseecom_dma_alloc(qcuefi->client, cmd_buf_size, &cmd_buf_dma, GFP_KERNEL); + if (!cmd_buf) { efi_status = EFI_OUT_OF_RESOURCES; goto out; } - rsp_data = kzalloc(rsp_size, GFP_KERNEL); - if (!rsp_data) { - efi_status = EFI_OUT_OF_RESOURCES; - goto out_free_req; - } + req_data = cmd_buf + req_offs; + rsp_data = cmd_buf + rsp_offs; req_data->command_id = QSEE_CMD_UEFI_GET_VARIABLE; req_data->data_size = buffer_size; @@ -332,7 +358,9 @@ static efi_status_t qsee_uefi_get_variable(struct qcuefi_client *qcuefi, const e memcpy(((void *)req_data) + req_data->guid_offset, guid, req_data->guid_size); - status = qcom_qseecom_app_send(qcuefi->client, req_data, req_size, rsp_data, rsp_size); + status = qcom_qseecom_app_send(qcuefi->client, + cmd_buf_dma + req_offs, req_size, + cmd_buf_dma + rsp_offs, rsp_size); if (status) { efi_status = EFI_DEVICE_ERROR; goto out_free; @@ -407,9 +435,7 @@ static efi_status_t qsee_uefi_get_variable(struct qcuefi_client *qcuefi, const e memcpy(data, ((void *)rsp_data) + rsp_data->data_offset, rsp_data->data_size); out_free: - kfree(rsp_data); -out_free_req: - kfree(req_data); + qseecom_dma_free(qcuefi->client, cmd_buf_size, cmd_buf, cmd_buf_dma); out: return efi_status; } @@ -422,10 +448,15 @@ static efi_status_t qsee_uefi_set_variable(struct qcuefi_client *qcuefi, const e struct qsee_rsp_uefi_set_variable *rsp_data; efi_status_t efi_status = EFI_SUCCESS; unsigned long name_length; + dma_addr_t cmd_buf_dma; + size_t cmd_buf_size; + void *cmd_buf; size_t name_offs; size_t guid_offs; size_t data_offs; size_t req_size; + size_t req_offs; + size_t rsp_offs; ssize_t status; if (!name || !guid) @@ -450,17 +481,19 @@ static efi_status_t qsee_uefi_set_variable(struct qcuefi_client *qcuefi, const e __array_offs(u8, data_size, &data_offs) ); - req_data = kzalloc(req_size, GFP_KERNEL); - if (!req_data) { + cmd_buf_size = qcuefi_buf_align_fields( + __reqdata_offs(req_size, &req_offs) + __reqdata_offs(sizeof(*rsp_data), &rsp_offs) + ); + + cmd_buf = qseecom_dma_alloc(qcuefi->client, cmd_buf_size, &cmd_buf_dma, GFP_KERNEL); + if (!cmd_buf) { efi_status = EFI_OUT_OF_RESOURCES; goto out; } - rsp_data = kzalloc(sizeof(*rsp_data), GFP_KERNEL); - if (!rsp_data) { - efi_status = EFI_OUT_OF_RESOURCES; - goto out_free_req; - } + req_data = cmd_buf + req_offs; + rsp_data = cmd_buf + rsp_offs; req_data->command_id = QSEE_CMD_UEFI_SET_VARIABLE; req_data->attributes = attributes; @@ -483,8 +516,9 @@ static efi_status_t qsee_uefi_set_variable(struct qcuefi_client *qcuefi, const e if (data_size) memcpy(((void *)req_data) + req_data->data_offset, data, req_data->data_size); - status = qcom_qseecom_app_send(qcuefi->client, req_data, req_size, rsp_data, - sizeof(*rsp_data)); + status = qcom_qseecom_app_send(qcuefi->client, + cmd_buf_dma + req_offs, req_size, + cmd_buf_dma + rsp_offs, sizeof(*rsp_data)); if (status) { efi_status = EFI_DEVICE_ERROR; goto out_free; @@ -507,9 +541,7 @@ static efi_status_t qsee_uefi_set_variable(struct qcuefi_client *qcuefi, const e } out_free: - kfree(rsp_data); -out_free_req: - kfree(req_data); + qseecom_dma_free(qcuefi->client, cmd_buf_size, cmd_buf, cmd_buf_dma); out: return efi_status; } @@ -521,10 +553,15 @@ static efi_status_t qsee_uefi_get_next_variable(struct qcuefi_client *qcuefi, struct qsee_req_uefi_get_next_variable *req_data; struct qsee_rsp_uefi_get_next_variable *rsp_data; efi_status_t efi_status = EFI_SUCCESS; + dma_addr_t cmd_buf_dma; + size_t cmd_buf_size; + void *cmd_buf; size_t guid_offs; size_t name_offs; size_t req_size; size_t rsp_size; + size_t req_offs; + size_t rsp_offs; ssize_t status; if (!name_size || !name || !guid) @@ -545,17 +582,19 @@ static efi_status_t qsee_uefi_get_next_variable(struct qcuefi_client *qcuefi, __array(*name, *name_size / sizeof(*name)) ); - req_data = kzalloc(req_size, GFP_KERNEL); - if (!req_data) { + cmd_buf_size = qcuefi_buf_align_fields( + __reqdata_offs(req_size, &req_offs) + __reqdata_offs(rsp_size, &rsp_offs) + ); + + cmd_buf = qseecom_dma_alloc(qcuefi->client, cmd_buf_size, &cmd_buf_dma, GFP_KERNEL); + if (!cmd_buf) { efi_status = EFI_OUT_OF_RESOURCES; goto out; } - rsp_data = kzalloc(rsp_size, GFP_KERNEL); - if (!rsp_data) { - efi_status = EFI_OUT_OF_RESOURCES; - goto out_free_req; - } + req_data = cmd_buf + req_offs; + rsp_data = cmd_buf + rsp_offs; req_data->command_id = QSEE_CMD_UEFI_GET_NEXT_VARIABLE; req_data->guid_offset = guid_offs; @@ -572,7 +611,9 @@ static efi_status_t qsee_uefi_get_next_variable(struct qcuefi_client *qcuefi, goto out_free; } - status = qcom_qseecom_app_send(qcuefi->client, req_data, req_size, rsp_data, rsp_size); + status = qcom_qseecom_app_send(qcuefi->client, + cmd_buf_dma + req_offs, req_size, + cmd_buf_dma + rsp_offs, rsp_size); if (status) { efi_status = EFI_DEVICE_ERROR; goto out_free; @@ -645,9 +686,7 @@ static efi_status_t qsee_uefi_get_next_variable(struct qcuefi_client *qcuefi, } out_free: - kfree(rsp_data); -out_free_req: - kfree(req_data); + qseecom_dma_free(qcuefi->client, cmd_buf_size, cmd_buf, cmd_buf_dma); out: return efi_status; } @@ -659,26 +698,34 @@ static efi_status_t qsee_uefi_query_variable_info(struct qcuefi_client *qcuefi, struct qsee_req_uefi_query_variable_info *req_data; struct qsee_rsp_uefi_query_variable_info *rsp_data; efi_status_t efi_status = EFI_SUCCESS; + dma_addr_t cmd_buf_dma; + size_t cmd_buf_size; + void *cmd_buf; + size_t req_offs; + size_t rsp_offs; int status; - req_data = kzalloc(sizeof(*req_data), GFP_KERNEL); - if (!req_data) { + cmd_buf_size = qcuefi_buf_align_fields( + __reqdata_offs(sizeof(*req_data), &req_offs) + __reqdata_offs(sizeof(*rsp_data), &rsp_offs) + ); + + cmd_buf = qseecom_dma_alloc(qcuefi->client, cmd_buf_size, &cmd_buf_dma, GFP_KERNEL); + if (!cmd_buf) { efi_status = EFI_OUT_OF_RESOURCES; goto out; } - rsp_data = kzalloc(sizeof(*rsp_data), GFP_KERNEL); - if (!rsp_data) { - efi_status = EFI_OUT_OF_RESOURCES; - goto out_free_req; - } + req_data = cmd_buf + req_offs; + rsp_data = cmd_buf + rsp_offs; req_data->command_id = QSEE_CMD_UEFI_QUERY_VARIABLE_INFO; req_data->attributes = attr; req_data->length = sizeof(*req_data); - status = qcom_qseecom_app_send(qcuefi->client, req_data, sizeof(*req_data), rsp_data, - sizeof(*rsp_data)); + status = qcom_qseecom_app_send(qcuefi->client, + cmd_buf_dma + req_offs, sizeof(*req_data), + cmd_buf_dma + rsp_offs, sizeof(*rsp_data)); if (status) { efi_status = EFI_DEVICE_ERROR; goto out_free; @@ -711,9 +758,7 @@ static efi_status_t qsee_uefi_query_variable_info(struct qcuefi_client *qcuefi, *max_variable_size = rsp_data->max_variable_size; out_free: - kfree(rsp_data); -out_free_req: - kfree(req_data); + qseecom_dma_free(qcuefi->client, cmd_buf_size, cmd_buf, cmd_buf_dma); out: return efi_status; } diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c index 520de9b5633a..90283f160a22 100644 --- a/drivers/firmware/qcom/qcom_scm.c +++ b/drivers/firmware/qcom/qcom_scm.c @@ -1576,9 +1576,9 @@ EXPORT_SYMBOL_GPL(qcom_scm_qseecom_app_get_id); /** * qcom_scm_qseecom_app_send() - Send to and receive data from a given QSEE app. * @app_id: The ID of the target app. - * @req: Request buffer sent to the app (must be DMA-mappable). + * @req: DMA address of the request buffer sent to the app. * @req_size: Size of the request buffer. - * @rsp: Response buffer, written to by the app (must be DMA-mappable). + * @rsp: DMA address of the response buffer, written to by the app. * @rsp_size: Size of the response buffer. * * Sends a request to the QSEE app associated with the given ID and read back @@ -1589,33 +1589,13 @@ EXPORT_SYMBOL_GPL(qcom_scm_qseecom_app_get_id); * * Return: Zero on success, nonzero on failure. */ -int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, void *rsp, - size_t rsp_size) +int qcom_scm_qseecom_app_send(u32 app_id, dma_addr_t req, size_t req_size, + dma_addr_t rsp, size_t rsp_size) { struct qcom_scm_qseecom_resp res = {}; struct qcom_scm_desc desc = {}; - dma_addr_t req_phys; - dma_addr_t rsp_phys; int status; - /* Map request buffer */ - req_phys = dma_map_single(__scm->dev, req, req_size, DMA_TO_DEVICE); - status = dma_mapping_error(__scm->dev, req_phys); - if (status) { - dev_err(__scm->dev, "qseecom: failed to map request buffer\n"); - return status; - } - - /* Map response buffer */ - rsp_phys = dma_map_single(__scm->dev, rsp, rsp_size, DMA_FROM_DEVICE); - status = dma_mapping_error(__scm->dev, rsp_phys); - if (status) { - dma_unmap_single(__scm->dev, req_phys, req_size, DMA_TO_DEVICE); - dev_err(__scm->dev, "qseecom: failed to map response buffer\n"); - return status; - } - - /* Set up SCM call data */ desc.owner = QSEECOM_TZ_OWNER_TZ_APPS; desc.svc = QSEECOM_TZ_SVC_APP_ID_PLACEHOLDER; desc.cmd = QSEECOM_TZ_CMD_APP_SEND; @@ -1623,18 +1603,13 @@ int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, void *rsp, QCOM_SCM_RW, QCOM_SCM_VAL, QCOM_SCM_RW, QCOM_SCM_VAL); desc.args[0] = app_id; - desc.args[1] = req_phys; + desc.args[1] = req; desc.args[2] = req_size; - desc.args[3] = rsp_phys; + desc.args[3] = rsp; desc.args[4] = rsp_size; - /* Perform call */ status = qcom_scm_qseecom_call(&desc, &res); - /* Unmap buffers */ - dma_unmap_single(__scm->dev, rsp_phys, rsp_size, DMA_FROM_DEVICE); - dma_unmap_single(__scm->dev, req_phys, req_size, DMA_TO_DEVICE); - if (status) return status; diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h index 5c28298a98be..366243ee9609 100644 --- a/include/linux/firmware/qcom/qcom_qseecom.h +++ b/include/linux/firmware/qcom/qcom_qseecom.h @@ -10,6 +10,7 @@ #define __QCOM_QSEECOM_H #include +#include #include #include @@ -24,12 +25,57 @@ struct qseecom_client { u32 app_id; }; +/** + * qseecom_scm_dev() - Get the SCM device associated with the QSEECOM client. + * @client: The QSEECOM client device. + * + * Returns the SCM device under which the provided QSEECOM client device + * operates. This function is intended to be used for DMA allocations. + */ +static inline struct device *qseecom_scm_dev(struct qseecom_client *client) +{ + return client->aux_dev.dev.parent->parent; +} + +/** + * qseecom_dma_alloc() - Allocate DMA memory for a QSEECOM client. + * @client: The QSEECOM client to allocate the memory for. + * @size: The number of bytes to allocate. + * @dma_handle: Pointer to where the DMA address should be stored. + * @gfp: Allocation flags. + * + * Wrapper function for dma_alloc_coherent(), allocating DMA memory usable for + * TZ/QSEECOM communication. Refer to dma_alloc_coherent() for details. + */ +static inline void *qseecom_dma_alloc(struct qseecom_client *client, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + return dma_alloc_coherent(qseecom_scm_dev(client), size, dma_handle, gfp); +} + +/** + * dma_free_coherent() - Free QSEECOM DMA memory. + * @client: The QSEECOM client for which the memory has been allocated. + * @size: The number of bytes allocated. + * @cpu_addr: Virtual memory address to free. + * @dma_handle: DMA memory address to free. + * + * Wrapper function for dma_free_coherent(), freeing memory previously + * allocated with qseecom_dma_alloc(). Refer to dma_free_coherent() for + * details. + */ +static inline void qseecom_dma_free(struct qseecom_client *client, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + return dma_free_coherent(qseecom_scm_dev(client), size, cpu_addr, dma_handle); +} + /** * qcom_qseecom_app_send() - Send to and receive data from a given QSEE app. * @client: The QSEECOM client associated with the target app. - * @req: Request buffer sent to the app (must be DMA-mappable). + * @req: DMA address of the request buffer sent to the app. * @req_size: Size of the request buffer. - * @rsp: Response buffer, written to by the app (must be DMA-mappable). + * @rsp: DMA address of the response buffer, written to by the app. * @rsp_size: Size of the response buffer. * * Sends a request to the QSEE app associated with the given client and read @@ -43,8 +89,9 @@ struct qseecom_client { * * Return: Zero on success, nonzero on failure. */ -static inline int qcom_qseecom_app_send(struct qseecom_client *client, void *req, size_t req_size, - void *rsp, size_t rsp_size) +static inline int qcom_qseecom_app_send(struct qseecom_client *client, + dma_addr_t req, size_t req_size, + dma_addr_t rsp, size_t rsp_size) { return qcom_scm_qseecom_app_send(client->app_id, req, req_size, rsp, rsp_size); } diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index ccaf28846054..aaa19f93ac43 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -118,8 +118,8 @@ bool qcom_scm_lmh_dcvsh_available(void); #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, void *rsp, - size_t rsp_size); +int qcom_scm_qseecom_app_send(u32 app_id, dma_addr_t req, size_t req_size, + dma_addr_t rsp, size_t rsp_size); #else /* CONFIG_QCOM_QSEECOM */ @@ -128,9 +128,9 @@ static inline int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id) return -EINVAL; } -static inline int qcom_scm_qseecom_app_send(u32 app_id, void *req, - size_t req_size, void *rsp, - size_t rsp_size) +static inline int qcom_scm_qseecom_app_send(u32 app_id, + dma_addr_t req, size_t req_size, + dma_addr_t rsp, size_t rsp_size) { return -EINVAL; } -- cgit v1.2.3 From 32cf5a4eda464d76d553ee3f1b06c4d33d796c52 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 19 Apr 2024 11:51:12 -0400 Subject: Revert "svcrdma: Add Write chunk WRs to the RPC's Send WR chain" Performance regression reported with NFS/RDMA using Omnipath, bisected to commit e084ee673c77 ("svcrdma: Add Write chunk WRs to the RPC's Send WR chain"). Tracing on the server reports: nfsd-7771 [060] 1758.891809: svcrdma_sq_post_err: cq.id=205 cid=226 sc_sq_avail=13643/851 status=-12 sq_post_err reports ENOMEM, and the rdma->sc_sq_avail (13643) is larger than rdma->sc_sq_depth (851). The number of available Send Queue entries is always supposed to be smaller than the Send Queue depth. That seems like a Send Queue accounting bug in svcrdma. As it's getting to be late in the 6.9-rc cycle, revert this commit. It can be revisited in a subsequent kernel release. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218743 Fixes: e084ee673c77 ("svcrdma: Add Write chunk WRs to the RPC's Send WR chain") Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 ++---- net/sunrpc/xprtrdma/svc_rdma_rw.c | 86 +++++++++-------------------------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 5 +- 3 files changed, 26 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 24cd199dd6f3..d33bab33099a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -210,7 +210,6 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; - struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -239,10 +238,7 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; - - struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; - void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -274,14 +270,11 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); -extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_pcl *write_pcl, - struct svc_rdma_send_ctxt *sctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index f2a100c4c81f..40797114d50a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -230,28 +230,6 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) queue_work(svcrdma_wq, &info->wi_work); } -/** - * svc_rdma_write_chunk_release - Release Write chunk I/O resources - * @rdma: controlling transport - * @ctxt: Send context that is being released - */ -void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt) -{ - struct svc_rdma_write_info *info; - struct svc_rdma_chunk_ctxt *cc; - - while (!list_empty(&ctxt->sc_write_info_list)) { - info = list_first_entry(&ctxt->sc_write_info_list, - struct svc_rdma_write_info, wi_list); - list_del(&info->wi_list); - - cc = &info->wi_cc; - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); - svc_rdma_write_info_free(info); - } -} - /** * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources * @rdma: controlling transport @@ -308,11 +286,13 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_write(&cc->cc_cid); - return; + break; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); break; @@ -320,11 +300,12 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write_err(wc, &cc->cc_cid); } - /* The RDMA Write has flushed, so the client won't get - * some of the outgoing RPC message. Signal the loss - * to the client by closing the connection. - */ - svc_xprt_deferred_close(&rdma->sc_xprt); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + + if (unlikely(wc->status != IB_WC_SUCCESS)) + svc_xprt_deferred_close(&rdma->sc_xprt); + + svc_rdma_write_info_free(info); } /** @@ -620,19 +601,13 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -/* Link Write WRs for @chunk onto @sctxt's WR chain. - */ -static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; - struct ib_send_wr *first_wr; struct xdr_buf payload; - struct list_head *pos; - struct ib_cqe *cqe; int ret; if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, @@ -648,25 +623,10 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, if (ret != payload.len) goto out_err; - ret = -EINVAL; - if (unlikely(cc->cc_sqecount > rdma->sc_sq_depth)) - goto out_err; - - first_wr = sctxt->sc_wr_chain; - cqe = &cc->cc_cqe; - list_for_each(pos, &cc->cc_rwctxts) { - struct svc_rdma_rw_ctxt *rwc; - - rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); - first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, cqe, first_wr); - cqe = NULL; - } - sctxt->sc_wr_chain = first_wr; - sctxt->sc_sqecount += cc->cc_sqecount; - list_add(&info->wi_list, &sctxt->sc_write_info_list); - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(rdma, cc); + if (ret < 0) + goto out_err; return 0; out_err: @@ -675,27 +635,25 @@ out_err: } /** - * svc_rdma_prepare_write_list - Construct WR chain for sending Write list + * svc_rdma_send_write_list - Send all chunks on the Write list * @rdma: controlling RDMA transport - * @write_pcl: Write list provisioned by the client - * @sctxt: Send WR resources + * @rctxt: Write list provisioned by the client * @xdr: xdr_buf containing an RPC Reply message * * Returns zero on success, or a negative errno if one or more * Write chunks could not be sent. */ -int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_pcl *write_pcl, - struct svc_rdma_send_ctxt *sctxt, - const struct xdr_buf *xdr) +int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { struct svc_rdma_chunk *chunk; int ret; - pcl_for_each_chunk(chunk, write_pcl) { + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { if (!chunk->ch_payload_length) break; - ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); + ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); if (ret < 0) return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index dfca39abd16c..bb5436b719e0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -142,7 +142,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; - INIT_LIST_HEAD(&ctxt->sc_write_info_list); ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, rdma->sc_max_req_size); @@ -228,7 +227,6 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; - svc_rdma_write_chunk_release(rdma, ctxt); svc_rdma_reply_chunk_release(rdma, ctxt); if (ctxt->sc_page_count) @@ -1015,8 +1013,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_prepare_write_list(rdma, &rctxt->rc_write_pcl, sctxt, - &rqstp->rq_res); + ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); if (ret < 0) goto put_ctxt; -- cgit v1.2.3 From 96e20adc43c4f81e9163a5188cee75a6dd393e09 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 22 Apr 2024 09:38:33 +0300 Subject: regulator: change stubbed devm_regulator_get_enable to return Ok The devm_regulator_get_enable() should be a 'call and forget' API, meaning, when it is used to enable the regulators, the API does not provide a handle to do any further control of the regulators. It gives no real benefit to return an error from the stub if CONFIG_REGULATOR is not set. On the contrary, returning and error is causing problems to drivers when hardware is such it works out just fine with no regulator control. Returning an error forces drivers to specifically handle the case where CONFIG_REGULATOR is not set, making the mere existence of the stub questionalble. Furthermore, the stub of the regulator_enable() seems to be returning Ok. Change the stub implementation for the devm_regulator_get_enable() to return Ok so drivers do not separately handle the case where the CONFIG_REGULATOR is not set. Signed-off-by: Matti Vaittinen Reported-by: Aleksander Mazur Suggested-by: Guenter Roeck Fixes: da279e6965b3 ("regulator: Add devm helpers for get and enable") Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/ZiYF6d1V1vSPcsJS@drtxq0yyyyyyyyyyyyyby-3.rev.dnainternet.fi Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 4660582a3302..71232fb7dda3 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -320,7 +320,7 @@ devm_regulator_get_exclusive(struct device *dev, const char *id) static inline int devm_regulator_get_enable(struct device *dev, const char *id) { - return -ENODEV; + return 0; } static inline int devm_regulator_get_enable_optional(struct device *dev, -- cgit v1.2.3 From ff33132605c1a0acea59e4c523cb7c6fabe856b2 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Tue, 23 Apr 2024 14:38:28 +0300 Subject: regulator: change devm_regulator_get_enable_optional() stub to return Ok The devm_regulator_get_enable_optional() should be a 'call and forget' API, meaning, when it is used to enable the regulators, the API does not provide a handle to do any further control of the regulators. It gives no real benefit to return an error from the stub if CONFIG_REGULATOR is not set. On the contrary, returning an error is causing problems to drivers when hardware is such it works out just fine with no regulator control. Returning an error forces drivers to specifically handle the case where CONFIG_REGULATOR is not set, making the mere existence of the stub questionalble. Change the stub implementation for the devm_regulator_get_enable_optional() to return Ok so drivers do not separately handle the case where the CONFIG_REGULATOR is not set. Signed-off-by: Matti Vaittinen Fixes: da279e6965b3 ("regulator: Add devm helpers for get and enable") Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/ZiedtOE00Zozd3XO@fedora Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 71232fb7dda3..ed180ca419da 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -326,7 +326,7 @@ static inline int devm_regulator_get_enable(struct device *dev, const char *id) static inline int devm_regulator_get_enable_optional(struct device *dev, const char *id) { - return -ENODEV; + return 0; } static inline struct regulator *__must_check -- cgit v1.2.3 From 12bbaae7635a56049779db3bef6e7140d9aa5f67 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:40 +0000 Subject: mm: create FOLIO_FLAG_FALSE and FOLIO_TYPE_OPS macros Following the separation of FOLIO_FLAGS from PAGEFLAGS, separate FOLIO_FLAG_FALSE from PAGEFLAG_FALSE and FOLIO_TYPE_OPS from PAGE_TYPE_OPS. Link: https://lkml.kernel.org/r/20240321142448.1645400-3-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 70 +++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 652d77805e99..dc1607f1415e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -458,30 +458,51 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) +#define FOLIO_TEST_FLAG_FALSE(name) \ +static inline bool folio_test_##name(const struct folio *folio) \ +{ return false; } +#define FOLIO_SET_FLAG_NOOP(name) \ +static inline void folio_set_##name(struct folio *folio) { } +#define FOLIO_CLEAR_FLAG_NOOP(name) \ +static inline void folio_clear_##name(struct folio *folio) { } +#define __FOLIO_SET_FLAG_NOOP(name) \ +static inline void __folio_set_##name(struct folio *folio) { } +#define __FOLIO_CLEAR_FLAG_NOOP(name) \ +static inline void __folio_clear_##name(struct folio *folio) { } +#define FOLIO_TEST_SET_FLAG_FALSE(name) \ +static inline bool folio_test_set_##name(struct folio *folio) \ +{ return false; } +#define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ +static inline bool folio_test_clear_##name(struct folio *folio) \ +{ return false; } + +#define FOLIO_FLAG_FALSE(name) \ +FOLIO_TEST_FLAG_FALSE(name) \ +FOLIO_SET_FLAG_NOOP(name) \ +FOLIO_CLEAR_FLAG_NOOP(name) + #define TESTPAGEFLAG_FALSE(uname, lname) \ -static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ +FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ -static inline void folio_set_##lname(struct folio *folio) { } \ +FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ -static inline void folio_clear_##lname(struct folio *folio) { } \ +FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ -static inline void __folio_clear_##lname(struct folio *folio) { } \ +__FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ -static inline bool folio_test_set_##lname(struct folio *folio) \ -{ return 0; } \ +FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ -static inline bool folio_test_clear_##lname(struct folio *folio) \ -{ return 0; } \ +FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ @@ -977,35 +998,38 @@ static inline int page_has_type(const struct page *page) return page_type_has_type(page->page_type); } +#define FOLIO_TYPE_OPS(lname, fname) \ +static __always_inline bool folio_test_##fname(const struct folio *folio)\ +{ \ + return folio_test_type(folio, PG_##lname); \ +} \ +static __always_inline void __folio_set_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ + folio->page.page_type &= ~PG_##lname; \ +} \ +static __always_inline void __folio_clear_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ + folio->page.page_type |= PG_##lname; \ +} + #define PAGE_TYPE_OPS(uname, lname, fname) \ +FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return PageType(page, PG_##lname); \ } \ -static __always_inline int folio_test_##fname(const struct folio *folio)\ -{ \ - return folio_test_type(folio, PG_##lname); \ -} \ static __always_inline void __SetPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!PageType(page, 0), page); \ page->page_type &= ~PG_##lname; \ } \ -static __always_inline void __folio_set_##fname(struct folio *folio) \ -{ \ - VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ - folio->page.page_type &= ~PG_##lname; \ -} \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type |= PG_##lname; \ -} \ -static __always_inline void __folio_clear_##fname(struct folio *folio) \ -{ \ - VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ - folio->page.page_type |= PG_##lname; \ -} \ +} /* * PageBuddy() indicates that the page is free and in the buddy system -- cgit v1.2.3 From fd1a745ce03e37945674c14833870a9af0882e2d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:42 +0000 Subject: mm: support page_mapcount() on page_has_type() pages Return 0 for pages which can't be mapped. This matches how page_mapped() works. It is more convenient for users to not have to filter out these pages. Link: https://lkml.kernel.org/r/20240321142448.1645400-5-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- fs/proc/page.c | 7 ++----- include/linux/mm.h | 8 +++++--- include/linux/page-flags.h | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/page.c b/fs/proc/page.c index 195b077c0fac..9223856c934b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -67,7 +67,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, */ ppage = pfn_to_online_page(pfn); - if (!ppage || PageSlab(ppage) || page_has_type(ppage)) + if (!ppage) pcount = 0; else pcount = page_mapcount(ppage); @@ -124,11 +124,8 @@ u64 stable_page_flags(struct page *page) /* * pseudo flags for the well known (anonymous) memory mapped pages - * - * Note that page->_mapcount is overloaded in SLAB, so the - * simple test in page_mapped() is not enough. */ - if (!PageSlab(page) && page_mapped(page)) + if (page_mapped(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b0ee64225de..b6bdaa18b9e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1223,14 +1223,16 @@ static inline void page_mapcount_reset(struct page *page) * a large folio, it includes the number of times this page is mapped * as part of that folio. * - * The result is undefined for pages which cannot be mapped into userspace. - * For example SLAB or special types of pages. See function page_has_type(). - * They use this field in struct page differently. + * Will report 0 for pages which cannot be mapped into userspace, eg + * slab, page tables and similar. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; + /* Handle page_has_type() pages */ + if (mapcount < 0) + mapcount = 0; if (unlikely(PageCompound(page))) mapcount += folio_entire_mapcount(page_folio(page)); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dc1607f1415e..35a0087d0910 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -971,12 +971,12 @@ static inline bool is_page_hwpoison(struct page *page) * page_type may be used. Because it is initialised to -1, we invert the * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and - * low bits so that an underflow or overflow of page_mapcount() won't be + * low bits so that an underflow or overflow of _mapcount won't be * mistaken for a page type value. */ #define PAGE_TYPE_BASE 0xf0000000 -/* Reserve 0x0000007f to catch underflows of page_mapcount */ +/* Reserve 0x0000007f to catch underflows of _mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 -- cgit v1.2.3 From d99e3140a4d33e26066183ff727d8f02f56bec64 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:43 +0000 Subject: mm: turn folio_test_hugetlb into a PageType The current folio_test_hugetlb() can be fooled by a concurrent folio split into returning true for a folio which has never belonged to hugetlbfs. This can't happen if the caller holds a refcount on it, but we have a few places (memory-failure, compaction, procfs) which do not and should not take a speculative reference. Since hugetlb pages do not use individual page mapcounts (they are always fully mapped and use the entire_mapcount field to record the number of mappings), the PageType field is available now that page_mapcount() ignores the value in this field. In compaction and with CONFIG_DEBUG_VM enabled, the current implementation can result in an oops, as reported by Luis. This happens since 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") effectively added some VM_BUG_ON() checks in the PageHuge() testing path. [willy@infradead.org: update vmcoreinfo] Link: https://lkml.kernel.org/r/ZgGZUvsdhaT1Va-T@casper.infradead.org Link: https://lkml.kernel.org/r/20240321142448.1645400-6-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Reported-by: Luis Chamberlain Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218227 Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 70 ++++++++++++++++++++---------------------- include/trace/events/mmflags.h | 1 + kernel/vmcore_info.c | 5 ++- mm/hugetlb.c | 22 ++----------- 4 files changed, 39 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 35a0087d0910..4bf1c25fd1dc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,7 +190,6 @@ enum pageflags { /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, - PG_hugetlb = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ }; @@ -876,29 +875,6 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #define PG_head_mask ((1UL << PG_head)) -#ifdef CONFIG_HUGETLB_PAGE -int PageHuge(const struct page *page); -SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) -CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) - -/** - * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs - * @folio: The folio to test. - * - * Context: Any context. Caller should have a reference on the folio to - * prevent it from being turned into a tail page. - * Return: True for hugetlbfs folios, false for anon folios or folios - * belonging to other filesystems. - */ -static inline bool folio_test_hugetlb(const struct folio *folio) -{ - return folio_test_large(folio) && - test_bit(PG_hugetlb, const_folio_flags(folio, 1)); -} -#else -TESTPAGEFLAG_FALSE(Huge, hugetlb) -#endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -954,18 +930,6 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif -/* - * Check if a page is currently marked HWPoisoned. Note that this check is - * best effort only and inherently racy: there is no way to synchronize with - * failing hardware. - */ -static inline bool is_page_hwpoison(struct page *page) -{ - if (PageHWPoison(page)) - return true; - return PageHuge(page) && PageHWPoison(compound_head(page)); -} - /* * For pages that are never mapped to userspace (and aren't PageSlab), * page_type may be used. Because it is initialised to -1, we invert the @@ -982,6 +946,7 @@ static inline bool is_page_hwpoison(struct page *page) #define PG_offline 0x00000100 #define PG_table 0x00000200 #define PG_guard 0x00000400 +#define PG_hugetlb 0x00000800 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -1076,6 +1041,37 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) +#ifdef CONFIG_HUGETLB_PAGE +FOLIO_TYPE_OPS(hugetlb, hugetlb) +#else +FOLIO_TEST_FLAG_FALSE(hugetlb) +#endif + +/** + * PageHuge - Determine if the page belongs to hugetlbfs + * @page: The page to test. + * + * Context: Any context. + * Return: True for hugetlbfs pages, false for anon pages or pages + * belonging to other filesystems. + */ +static inline bool PageHuge(const struct page *page) +{ + return folio_test_hugetlb(page_folio(page)); +} + +/* + * Check if a page is currently marked HWPoisoned. Note that this check is + * best effort only and inherently racy: there is no way to synchronize with + * failing hardware. + */ +static inline bool is_page_hwpoison(struct page *page) +{ + if (PageHWPoison(page)) + return true; + return PageHuge(page) && PageHWPoison(compound_head(page)); +} + extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); @@ -1142,7 +1138,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ - 1UL << PG_hugetlb | 1UL << PG_large_rmappable) + 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index d801409b33cf..d55e53ac91bd 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -135,6 +135,7 @@ IF_HAVE_PG_ARCH_X(arch_3) #define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) } #define __def_pagetype_names \ + DEF_PAGETYPE_NAME(hugetlb), \ DEF_PAGETYPE_NAME(offline), \ DEF_PAGETYPE_NAME(guard), \ DEF_PAGETYPE_NAME(table), \ diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index f95516cd45bb..23c125c2e243 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -205,11 +205,10 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_head_mask); #define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(PG_hugetlb); +#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb) + VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); -#endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 53e0ab5c0845..4553241f0fb2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1624,7 +1624,7 @@ static inline void __clear_hugetlb_destructor(struct hstate *h, { lockdep_assert_held(&hugetlb_lock); - folio_clear_hugetlb(folio); + __folio_clear_hugetlb(folio); } /* @@ -1711,7 +1711,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, h->surplus_huge_pages_node[nid]++; } - folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above @@ -2049,7 +2049,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); @@ -2159,22 +2159,6 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, return __prep_compound_gigantic_folio(folio, order, true); } -/* - * PageHuge() only returns true for hugetlbfs pages, but not for normal or - * transparent huge pages. See the PageTransHuge() documentation for more - * details. - */ -int PageHuge(const struct page *page) -{ - const struct folio *folio; - - if (!PageCompound(page)) - return 0; - folio = page_folio(page); - return folio_test_hugetlb(folio); -} -EXPORT_SYMBOL_GPL(PageHuge); - /* * Find and lock address space (mapping) in write mode. * -- cgit v1.2.3 From ce0abef6a1d540acef85068e0e82bdf1fbeeb0e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2024 17:05:55 -0700 Subject: cpu: Ignore "mitigations" kernel parameter if CPU_MITIGATIONS=n Explicitly disallow enabling mitigations at runtime for kernels that were built with CONFIG_CPU_MITIGATIONS=n, as some architectures may omit code entirely if mitigations are disabled at compile time. E.g. on x86, a large pile of Kconfigs are buried behind CPU_MITIGATIONS, and trying to provide sane behavior for retroactively enabling mitigations is extremely difficult, bordering on impossible. E.g. page table isolation and call depth tracking require build-time support, BHI mitigations will still be off without additional kernel parameters, etc. [ bp: Touchups. ] Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240420000556.2645001-3-seanjc@google.com --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/x86/Kconfig | 8 ++++++-- include/linux/cpu.h | 11 +++++++++++ kernel/cpu.c | 14 ++++++++++---- 4 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 902ecd92a29f..213d0719e2b7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3423,6 +3423,9 @@ arch-independent options, each of which is an aggregation of existing arch-specific options. + Note, "mitigations" is supported if and only if the + kernel was built with CPU_MITIGATIONS=y. + off Disable all optional CPU mitigations. This improves system performance, but it may also diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 619a04d5c131..928820e61cb5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2495,9 +2495,13 @@ menuconfig CPU_MITIGATIONS help Say Y here to enable options which enable mitigations for hardware vulnerabilities (usually related to speculative execution). + Mitigations can be disabled or restricted to SMT systems at runtime + via the "mitigations" kernel parameter. - If you say N, all mitigations will be disabled. You really - should know what you are doing to say so. + If you say N, all mitigations will be disabled. This CANNOT be + overridden at runtime. + + Say 'Y', unless you really know what you are doing. if CPU_MITIGATIONS diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 272e4e79e15c..861c3bfc5f17 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -221,7 +221,18 @@ void cpuhp_report_idle_dead(void); static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_CPU_MITIGATIONS extern bool cpu_mitigations_off(void); extern bool cpu_mitigations_auto_nosmt(void); +#else +static inline bool cpu_mitigations_off(void) +{ + return true; +} +static inline bool cpu_mitigations_auto_nosmt(void) +{ + return false; +} +#endif #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index bb0ff275fb46..63447eb85dab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3196,6 +3196,7 @@ void __init boot_cpu_hotplug_init(void) this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); } +#ifdef CONFIG_CPU_MITIGATIONS /* * These are used for a global "mitigations=" cmdline option for toggling * optional CPU mitigations. @@ -3206,9 +3207,7 @@ enum cpu_mitigations { CPU_MITIGATIONS_AUTO_NOSMT, }; -static enum cpu_mitigations cpu_mitigations __ro_after_init = - IS_ENABLED(CONFIG_CPU_MITIGATIONS) ? CPU_MITIGATIONS_AUTO : - CPU_MITIGATIONS_OFF; +static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { @@ -3224,7 +3223,6 @@ static int __init mitigations_parse_cmdline(char *arg) return 0; } -early_param("mitigations", mitigations_parse_cmdline); /* mitigations=off */ bool cpu_mitigations_off(void) @@ -3239,3 +3237,11 @@ bool cpu_mitigations_auto_nosmt(void) return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; } EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); +#else +static int __init mitigations_parse_cmdline(char *arg) +{ + pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n"); + return 0; +} +#endif +early_param("mitigations", mitigations_parse_cmdline); -- cgit v1.2.3 From 6e159fd653d7ebf6290358e0330a0cb8a75cf73b Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 23 Apr 2024 11:13:03 -0700 Subject: ethernet: Add helper for assigning packet type when dest address does not match device address Enable reuse of logic in eth_type_trans for determining packet type. Suggested-by: Sabrina Dubroca Cc: stable@vger.kernel.org Signed-off-by: Rahul Rameshbabu Reviewed-by: Sabrina Dubroca Link: https://lore.kernel.org/r/20240423181319.115860-3-rrameshbabu@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 25 +++++++++++++++++++++++++ net/ethernet/eth.c | 12 +----------- 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 224645f17c33..297231854ada 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -607,6 +607,31 @@ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, eth_hw_addr_set(dev, addr); } +/** + * eth_skb_pkt_type - Assign packet type if destination address does not match + * @skb: Assigned a packet type if address does not match @dev address + * @dev: Network device used to compare packet address against + * + * If the destination MAC address of the packet does not match the network + * device address, assign an appropriate packet type. + */ +static inline void eth_skb_pkt_type(struct sk_buff *skb, + const struct net_device *dev) +{ + const struct ethhdr *eth = eth_hdr(skb); + + if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { + if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + skb->pkt_type = PACKET_OTHERHOST; + } + } +} + /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 2edc8b796a4e..049c3adeb850 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -164,17 +164,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - if (unlikely(!ether_addr_equal_64bits(eth->h_dest, - dev->dev_addr))) { - if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { - if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) - skb->pkt_type = PACKET_BROADCAST; - else - skb->pkt_type = PACKET_MULTICAST; - } else { - skb->pkt_type = PACKET_OTHERHOST; - } - } + eth_skb_pkt_type(skb, dev); /* * Some variants of DSA tagging don't have an ethertype field -- cgit v1.2.3 From 66e13b615a0ce76b785d780ecc9776ba71983629 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:08 +0000 Subject: bpf: verifier: prevent userspace memory access With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To thwart invalid memory accesses, the JITs add an exception table entry for all such accesses. But in case the src_reg + offset is a userspace address, the BPF program might read that memory if the user has mapped it. Make the verifier add guard instructions around such memory accesses and skip the load if the address falls into the userspace region. The JITs need to implement bpf_arch_uaddress_limit() to define where the userspace addresses end for that architecture or TASK_SIZE is taken as default. The implementation is as follows: REG_AX = SRC_REG if(offset) REG_AX += offset; REG_AX >>= 32; if (REG_AX <= (uaddress_limit >> 32)) DST_REG = 0; else DST_REG = *(size *)(SRC_REG + offset); Comparing just the upper 32 bits of the load address with the upper 32 bits of uaddress_limit implies that the values are being aligned down to a 4GB boundary before comparison. The above means that all loads with address <= uaddress_limit + 4GB are skipped. This is acceptable because there is a large hole (much larger than 4GB) between userspace and kernel space memory, therefore a correctly functioning BPF program should not access this 4GB memory above the userspace. Let's analyze what this patch does to the following fentry program dereferencing an untrusted pointer: SEC("fentry/tcp_v4_connect") int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) { *(volatile long *)sk; return 0; } BPF Program before | BPF Program after ------------------ | ----------------- 0: (79) r1 = *(u64 *)(r1 +0) 0: (79) r1 = *(u64 *)(r1 +0) ----------------------------------------------------------------------- 1: (79) r1 = *(u64 *)(r1 +0) --\ 1: (bf) r11 = r1 ----------------------------\ \ 2: (77) r11 >>= 32 2: (b7) r0 = 0 \ \ 3: (b5) if r11 <= 0x8000 goto pc+2 3: (95) exit \ \-> 4: (79) r1 = *(u64 *)(r1 +0) \ 5: (05) goto pc+1 \ 6: (b7) r1 = 0 \-------------------------------------- 7: (b7) r0 = 0 8: (95) exit As you can see from above, in the best case (off=0), 5 extra instructions are emitted. Now, we analyze the same program after it has gone through the JITs of ARM64 and RISC-V architectures. We follow the single load instruction that has the untrusted pointer and see what instrumentation has been added around it. x86-64 JIT ========== JIT's Instrumentation (upstream) --------------------- 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 7: push %rbp 8: mov %rsp,%rbp b: mov 0x0(%rdi),%rdi --------------------------------- f: movabs $0x800000000000,%r11 19: cmp %r11,%rdi 1c: jb 0x000000000000002a 1e: mov %rdi,%r11 21: add $0x0,%r11 28: jae 0x000000000000002e 2a: xor %edi,%edi 2c: jmp 0x0000000000000032 2e: mov 0x0(%rdi),%rdi --------------------------------- 32: xor %eax,%eax 34: leave 35: ret The x86-64 JIT already emits some instructions to protect against user memory access. This patch doesn't make any changes for the x86-64 JIT. ARM64 JIT ========= No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: add x9, x30, #0x0 0: add x9, x30, #0x0 4: nop 4: nop 8: paciasp 8: paciasp c: stp x29, x30, [sp, #-16]! c: stp x29, x30, [sp, #-16]! 10: mov x29, sp 10: mov x29, sp 14: stp x19, x20, [sp, #-16]! 14: stp x19, x20, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 24: mov x25, sp 24: mov x25, sp 28: mov x26, #0x0 28: mov x26, #0x0 2c: sub x27, x25, #0x0 2c: sub x27, x25, #0x0 30: sub sp, sp, #0x0 30: sub sp, sp, #0x0 34: ldr x0, [x0] 34: ldr x0, [x0] -------------------------------------------------------------------------------- 38: ldr x0, [x0] ----------\ 38: add x9, x0, #0x0 -----------------------------------\\ 3c: lsr x9, x9, #32 3c: mov x7, #0x0 \\ 40: cmp x9, #0x10, lsl #12 40: mov sp, sp \\ 44: b.ls 0x0000000000000050 44: ldp x27, x28, [sp], #16 \\--> 48: ldr x0, [x0] 48: ldp x25, x26, [sp], #16 \ 4c: b 0x0000000000000054 4c: ldp x21, x22, [sp], #16 \ 50: mov x0, #0x0 50: ldp x19, x20, [sp], #16 \--------------------------------------- 54: ldp x29, x30, [sp], #16 54: mov x7, #0x0 58: add x0, x7, #0x0 58: mov sp, sp 5c: autiasp 5c: ldp x27, x28, [sp], #16 60: ret 60: ldp x25, x26, [sp], #16 64: nop 64: ldp x21, x22, [sp], #16 68: ldr x10, 0x0000000000000070 68: ldp x19, x20, [sp], #16 6c: br x10 6c: ldp x29, x30, [sp], #16 70: add x0, x7, #0x0 74: autiasp 78: ret 7c: nop 80: ldr x10, 0x0000000000000088 84: br x10 There are 6 extra instructions added in ARM64 in the best case. This will become 7 in the worst case (off != 0). RISC-V JIT (RISCV_ISA_C Disabled) ========== No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: nop 0: nop 4: nop 4: nop 8: li a6, 33 8: li a6, 33 c: addi sp, sp, -16 c: addi sp, sp, -16 10: sd s0, 8(sp) 10: sd s0, 8(sp) 14: addi s0, sp, 16 14: addi s0, sp, 16 18: ld a0, 0(a0) 18: ld a0, 0(a0) --------------------------------------------------------------- 1c: ld a0, 0(a0) --\ 1c: mv t0, a0 --------------------------\ \ 20: srli t0, t0, 32 20: li a5, 0 \ \ 24: lui t1, 4096 24: ld s0, 8(sp) \ \ 28: sext.w t1, t1 28: addi sp, sp, 16 \ \ 2c: bgeu t1, t0, 12 2c: sext.w a0, a5 \ \--> 30: ld a0, 0(a0) 30: ret \ 34: j 8 \ 38: li a0, 0 \------------------------------ 3c: li a5, 0 40: ld s0, 8(sp) 44: addi sp, sp, 16 48: sext.w a0, a5 4c: ret There are 7 extra instructions added in RISC-V. Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables") Reported-by: Breno Leitao Suggested-by: Alexei Starovoitov Acked-by: Ilya Leoshkevich Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 6 ++++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 9 +++++++++ kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index df5fac428408..b520b66ad14b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3473,3 +3473,9 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +/* x86-64 JIT emits its own code to filter user addresses so return 0 here */ +u64 bpf_arch_uaddress_limit(void) +{ + return 0; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index c99bc3df2d28..219ee7a76874 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -963,6 +963,7 @@ bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); +u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 696bc55de8e8..1ea5ce5bb599 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2942,6 +2942,15 @@ bool __weak bpf_jit_supports_arena(void) return false; } +u64 __weak bpf_arch_uaddress_limit(void) +{ +#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) + return TASK_SIZE; +#else + return 0; +#endif +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa24beb65393..cb7ad1f795e1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19675,6 +19675,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Make it impossible to de-reference a userspace address */ + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { + struct bpf_insn *patch = &insn_buf[0]; + u64 uaddress_limit = bpf_arch_uaddress_limit(); + + if (!uaddress_limit) + goto next_insn; + + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + if (insn->off) + *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); + *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); + *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); + *patch++ = *insn; + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || -- cgit v1.2.3 From 2e5449f4f21a1b0bd9beec4c4b580eb1f9b9ed7f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 27 Apr 2024 15:27:58 +0900 Subject: profiling: Remove create_prof_cpu_mask(). create_prof_cpu_mask() is no longer used after commit 1f44a225777e ("s390: convert interrupt handling to use generic hardirq"). Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- include/linux/profile.h | 5 ----- kernel/profile.c | 43 ------------------------------------------- 2 files changed, 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/profile.h b/include/linux/profile.h index 11db1ec516e2..04ae5ebcb637 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -18,13 +18,8 @@ struct proc_dir_entry; struct notifier_block; #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS) -void create_prof_cpu_mask(void); int create_proc_profile(void); #else -static inline void create_prof_cpu_mask(void) -{ -} - static inline int create_proc_profile(void) { return 0; diff --git a/kernel/profile.c b/kernel/profile.c index 8a77769bc4b4..2b775cc5c28f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -344,49 +344,6 @@ void profile_tick(int type) #include #include -static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask)); - return 0; -} - -static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, prof_cpu_mask_proc_show, NULL); -} - -static ssize_t prof_cpu_mask_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *pos) -{ - cpumask_var_t new_value; - int err; - - if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) - return -ENOMEM; - - err = cpumask_parse_user(buffer, count, new_value); - if (!err) { - cpumask_copy(prof_cpu_mask, new_value); - err = count; - } - free_cpumask_var(new_value); - return err; -} - -static const struct proc_ops prof_cpu_mask_proc_ops = { - .proc_open = prof_cpu_mask_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = single_release, - .proc_write = prof_cpu_mask_proc_write, -}; - -void create_prof_cpu_mask(void) -{ - /* create /proc/irq/prof_cpu_mask */ - proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops); -} - /* * This function accesses profiling information. The returned data is * binary: the sampling step and the actual contents of the profile -- cgit v1.2.3 From b63db58e2fa5d6963db9c45df88e60060f0ff35f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 2 May 2024 09:03:15 -0400 Subject: eventfs/tracing: Add callback for release of an eventfs_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Synthetic events create and destroy tracefs files when they are created and removed. The tracing subsystem has its own file descriptor representing the state of the events attached to the tracefs files. There's a race between the eventfs files and this file descriptor of the tracing system where the following can cause an issue: With two scripts 'A' and 'B' doing: Script 'A': echo "hello int aaa" > /sys/kernel/tracing/synthetic_events while : do echo 0 > /sys/kernel/tracing/events/synthetic/hello/enable done Script 'B': echo > /sys/kernel/tracing/synthetic_events Script 'A' creates a synthetic event "hello" and then just writes zero into its enable file. Script 'B' removes all synthetic events (including the newly created "hello" event). What happens is that the opening of the "enable" file has: { struct trace_event_file *file = inode->i_private; int ret; ret = tracing_check_open_get_tr(file->tr); [..] But deleting the events frees the "file" descriptor, and a "use after free" happens with the dereference at "file->tr". The file descriptor does have a reference counter, but there needs to be a way to decrement it from the eventfs when the eventfs_inode is removed that represents this file descriptor. Add an optional "release" callback to the eventfs_entry array structure, that gets called when the eventfs file is about to be removed. This allows for the creating on the eventfs file to increment the tracing file descriptor ref counter. When the eventfs file is deleted, it can call the release function that will call the put function for the tracing file descriptor. This will protect the tracing file from being freed while a eventfs file that references it is being opened. Link: https://lore.kernel.org/linux-trace-kernel/20240426073410.17154-1-Tze-nan.Wu@mediatek.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240502090315.448cba46@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Tze-nan wu Tested-by: Tze-nan Wu (吳澤南) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 23 +++++++++++++++++++++-- include/linux/tracefs.h | 3 +++ kernel/trace/trace_events.c | 12 ++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 894c6ca1e500..f5510e26f0f6 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -84,10 +84,17 @@ enum { static void release_ei(struct kref *ref) { struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); + const struct eventfs_entry *entry; struct eventfs_root_inode *rei; WARN_ON_ONCE(!ei->is_freed); + for (int i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (entry->release) + entry->release(entry->name, ei->data); + } + kfree(ei->entry_attrs); kfree_const(ei->name); if (ei->is_events) { @@ -112,6 +119,18 @@ static inline void free_ei(struct eventfs_inode *ei) } } +/* + * Called when creation of an ei fails, do not call release() functions. + */ +static inline void cleanup_ei(struct eventfs_inode *ei) +{ + if (ei) { + /* Set nr_entries to 0 to prevent release() function being called */ + ei->nr_entries = 0; + free_ei(ei); + } +} + static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei) { if (ei) @@ -734,7 +753,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode /* Was the parent freed? */ if (list_empty(&ei->list)) { - free_ei(ei); + cleanup_ei(ei); ei = NULL; } return ei; @@ -835,7 +854,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ei; fail: - free_ei(ei); + cleanup_ei(ei); tracefs_failed_creating(dentry); return ERR_PTR(-ENOMEM); } diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 7a5fe17b6bf9..d03f74658716 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -62,6 +62,8 @@ struct eventfs_file; typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); +typedef void (*eventfs_release)(const char *name, void *data); + /** * struct eventfs_entry - dynamically created eventfs file call back handler * @name: Then name of the dynamic file in an eventfs directory @@ -72,6 +74,7 @@ typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, struct eventfs_entry { const char *name; eventfs_callback callback; + eventfs_release release; }; struct eventfs_inode; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 52f75c36bbca..6ef29eba90ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2552,6 +2552,14 @@ static int event_callback(const char *name, umode_t *mode, void **data, return 0; } +/* The file is incremented on creation and freeing the enable file decrements it */ +static void event_release(const char *name, void *data) +{ + struct trace_event_file *file = data; + + event_file_put(file); +} + static int event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { @@ -2566,6 +2574,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { .name = "enable", .callback = event_callback, + .release = event_release, }, { .name = "filter", @@ -2634,6 +2643,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) return ret; } + /* Gets decremented on freeing of the "enable" file */ + event_file_get(file); + return 0; } -- cgit v1.2.3