From 7efa84b5cdd6d473c7e80912638fca9d7167f202 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 4 Apr 2025 15:10:02 -0700 Subject: compiler-gcc.h: Introduce __diag_GCC_all It is not possible disabling a diagnostic for all versions of GCC without hard coding the minimum supported version at the site, as the GCC specific macros require a minimum version to disable the warning for: __diag_ignore(GCC, 5, ...); __diag_ignore_all() does not solve this issue because it disables a diagnostic for all versions of both GCC and clang, not just one or the other. Introduce __diag_GCC_all so that developers can write __diag_ignore(GCC, all, ...); to disable a particular diagnostic for all versions of GCC, while not affecting clang. Closes: https://lore.kernel.org/r/CAHk-=wgfX9nBGE0Ap9GjhOy7Mn=RSy=rx0MvqfYFFDx31KJXqQ@mail.gmail.com Signed-off-by: Nathan Chancellor Tested-by: Andy Shevchenko Reviewed-by: Petr Mladek Link: https://patch.msgid.link/20250404-vsprintf-convert-pragmas-to-__diag-v1-1-5d6c5c55b2bd@kernel.org Signed-off-by: Petr Mladek --- include/linux/compiler-gcc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c9b58188ec61..c75a222880f9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -127,6 +127,8 @@ #define __diag_GCC_8(s) #endif +#define __diag_GCC_all(s) __diag(s) + #define __diag_ignore_all(option, comment) \ __diag(__diag_GCC_ignore option) -- cgit v1.2.3 From de1c831a7898f164c1c2703c6b2b9e4fb4bebefc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Apr 2025 10:02:33 -0700 Subject: slab: Decouple slab_debug and no_hash_pointers Some system owners use slab_debug=FPZ (or similar) as a hardening option, but do not want to be forced into having kernel addresses exposed due to the implicit "no_hash_pointers" boot param setting.[1] Introduce the "hash_pointers" boot param, which defaults to "auto" (the current behavior), but also includes "always" (forcing on hashing even when "slab_debug=..." is defined), and "never". The existing "no_hash_pointers" boot param becomes an alias for "hash_pointers=never". This makes it possible to boot with "slab_debug=FPZ hash_pointers=always". Link: https://github.com/KSPP/linux/issues/368 [1] Fixes: 792702911f58 ("slub: force on no_hash_pointers when slub_debug is enabled") Co-developed-by: Sergio Perez Gonzalez Signed-off-by: Sergio Perez Gonzalez Acked-by: Vlastimil Babka Acked-by: David Rientjes Reviewed-by: Bagas Sanjaya Signed-off-by: Kees Cook Reviewed-by: Harry Yoo Acked-by: Rafael Aquini Tested-by: Petr Mladek Reviewed-by: Petr Mladek Link: https://patch.msgid.link/20250415170232.it.467-kees@kernel.org [kees@kernel.org: Add note about hash_pointers into slab_debug kernel parameter documentation.] Signed-off-by: Petr Mladek --- Documentation/admin-guide/kernel-parameters.txt | 38 ++++++++++----- include/linux/sprintf.h | 2 +- lib/vsprintf.c | 61 +++++++++++++++++++++++-- mm/slub.c | 5 +- 4 files changed, 86 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f35d5b8c296..0dd5cd17e87e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1798,6 +1798,27 @@ backtraces on all cpus. Format: 0 | 1 + hash_pointers= + [KNL,EARLY] + By default, when pointers are printed to the console + or buffers via the %p format string, that pointer is + "hashed", i.e. obscured by hashing the pointer value. + This is a security feature that hides actual kernel + addresses from unprivileged users, but it also makes + debugging the kernel more difficult since unequal + pointers can no longer be compared. The choices are: + Format: { auto | always | never } + Default: auto + + auto - Hash pointers unless slab_debug is enabled. + always - Always hash pointers (even if slab_debug is + enabled). + never - Never hash pointers. This option should only + be specified when debugging the kernel. Do + not use on production kernels. The boot + param "no_hash_pointers" is an alias for + this mode. + hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on for 64-bit NUMA, off otherwise. @@ -4120,18 +4141,7 @@ no_hash_pointers [KNL,EARLY] - Force pointers printed to the console or buffers to be - unhashed. By default, when a pointer is printed via %p - format string, that pointer is "hashed", i.e. obscured - by hashing the pointer value. This is a security feature - that hides actual kernel addresses from unprivileged - users, but it also makes debugging the kernel more - difficult since unequal pointers can no longer be - compared. However, if this command-line option is - specified, then all normal pointers will have their true - value printed. This option should only be specified when - debugging the kernel. Please do not use on production - kernels. + Alias for "hash_pointers=never". nohibernate [HIBERNATION] Disable hibernation and resume. @@ -6481,6 +6491,10 @@ Documentation/mm/slub.rst. (slub_debug legacy name also accepted for now) + Using this option implies the "no_hash_pointers" + option which can be undone by adding the + "hash_pointers=always" option. + slab_max_order= [MM] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h index 51cab2def9ec..521bb2cd2648 100644 --- a/include/linux/sprintf.h +++ b/include/linux/sprintf.h @@ -22,7 +22,7 @@ __scanf(2, 0) int vsscanf(const char *, const char *, va_list); /* These are for specific cases, do not use without real need */ extern bool no_hash_pointers; -int no_hash_pointers_enable(char *str); +void hash_pointers_finalize(bool slub_debug); /* Used for Rust formatting ('%pA') */ char *rust_fmt_argument(char *buf, char *end, const void *ptr); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 01699852f30c..22cbd75266ef 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -60,6 +60,20 @@ bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); +/* + * Hashed pointers policy selected by "hash_pointers=..." boot param + * + * `auto` - Hashed pointers enabled unless disabled by slub_debug_enabled=true + * `always` - Hashed pointers enabled unconditionally + * `never` - Hashed pointers disabled unconditionally + */ +enum hash_pointers_policy { + HASH_PTR_AUTO = 0, + HASH_PTR_ALWAYS, + HASH_PTR_NEVER +}; +static enum hash_pointers_policy hash_pointers_mode __initdata; + noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { @@ -2271,12 +2285,23 @@ char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr, return resource_string(buf, end, ptr, spec, fmt); } -int __init no_hash_pointers_enable(char *str) +void __init hash_pointers_finalize(bool slub_debug) { - if (no_hash_pointers) - return 0; + switch (hash_pointers_mode) { + case HASH_PTR_ALWAYS: + no_hash_pointers = false; + break; + case HASH_PTR_NEVER: + no_hash_pointers = true; + break; + case HASH_PTR_AUTO: + default: + no_hash_pointers = slub_debug; + break; + } - no_hash_pointers = true; + if (!no_hash_pointers) + return; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); @@ -2289,11 +2314,39 @@ int __init no_hash_pointers_enable(char *str) pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! **\n"); pr_warn("** **\n"); + pr_warn("** Use hash_pointers=always to force this mode off **\n"); + pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); +} + +static int __init hash_pointers_mode_parse(char *str) +{ + if (!str) { + pr_warn("Hash pointers mode empty; falling back to auto.\n"); + hash_pointers_mode = HASH_PTR_AUTO; + } else if (strncmp(str, "auto", 4) == 0) { + pr_info("Hash pointers mode set to auto.\n"); + hash_pointers_mode = HASH_PTR_AUTO; + } else if (strncmp(str, "never", 5) == 0) { + pr_info("Hash pointers mode set to never.\n"); + hash_pointers_mode = HASH_PTR_NEVER; + } else if (strncmp(str, "always", 6) == 0) { + pr_info("Hash pointers mode set to always.\n"); + hash_pointers_mode = HASH_PTR_ALWAYS; + } else { + pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str); + hash_pointers_mode = HASH_PTR_AUTO; + } return 0; } +early_param("hash_pointers", hash_pointers_mode_parse); + +static int __init no_hash_pointers_enable(char *str) +{ + return hash_pointers_mode_parse("never"); +} early_param("no_hash_pointers", no_hash_pointers_enable); /* diff --git a/mm/slub.c b/mm/slub.c index b46f87662e71..f3d61b330a76 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6314,9 +6314,8 @@ void __init kmem_cache_init(void) if (debug_guardpage_minorder()) slub_max_order = 0; - /* Print slub debugging pointers without hashing */ - if (__slub_debug_enabled()) - no_hash_pointers_enable(NULL); + /* Inform pointer hashing choice about slub debugging state. */ + hash_pointers_finalize(__slub_debug_enabled()); kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; -- cgit v1.2.3 From 7934a8dd8692b56714ce9b36421e316445d94a77 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 6 Jun 2025 13:10:23 +0900 Subject: module: remove meaningless 'name' parameter from __MODULE_INFO() The symbol names in the .modinfo section are never used and already randomized by the __UNIQUE_ID() macro. Therefore, the second parameter of __MODULE_INFO() is meaningless and can be removed to simplify the code. With this change, the symbol names in the .modinfo section will be prefixed with __UNIQUE_ID_modinfo, making it clearer that they originate from MODULE_INFO(). [Before] $ objcopy -j .modinfo vmlinux.o modinfo.o $ nm -n modinfo.o | head -n10 0000000000000000 r __UNIQUE_ID_license560 0000000000000011 r __UNIQUE_ID_file559 0000000000000030 r __UNIQUE_ID_description558 0000000000000074 r __UNIQUE_ID_license580 000000000000008e r __UNIQUE_ID_file579 00000000000000bd r __UNIQUE_ID_description578 00000000000000e6 r __UNIQUE_ID_license581 00000000000000ff r __UNIQUE_ID_file580 0000000000000134 r __UNIQUE_ID_description579 0000000000000179 r __UNIQUE_ID_uncore_no_discover578 [After] $ objcopy -j .modinfo vmlinux.o modinfo.o $ nm -n modinfo.o | head -n10 0000000000000000 r __UNIQUE_ID_modinfo560 0000000000000011 r __UNIQUE_ID_modinfo559 0000000000000030 r __UNIQUE_ID_modinfo558 0000000000000074 r __UNIQUE_ID_modinfo580 000000000000008e r __UNIQUE_ID_modinfo579 00000000000000bd r __UNIQUE_ID_modinfo578 00000000000000e6 r __UNIQUE_ID_modinfo581 00000000000000ff r __UNIQUE_ID_modinfo580 0000000000000134 r __UNIQUE_ID_modinfo579 0000000000000179 r __UNIQUE_ID_modinfo578 Signed-off-by: Masahiro Yamada Reviewed-by: Petr Pavlu --- include/crypto/algapi.h | 4 ++-- include/linux/module.h | 3 --- include/linux/moduleparam.h | 9 +++++---- include/net/tcp.h | 4 ++-- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 188eface0a11..fc4574940636 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -43,8 +43,8 @@ * alias. */ #define MODULE_ALIAS_CRYPTO(name) \ - __MODULE_INFO(alias, alias_userspace, name); \ - __MODULE_INFO(alias, alias_crypto, "crypto-" name) + MODULE_INFO(alias, name); \ + MODULE_INFO(alias, "crypto-" name) struct crypto_aead; struct crypto_instance; diff --git a/include/linux/module.h b/include/linux/module.h index 92e1420fccdf..81b41cc6a19e 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -164,9 +164,6 @@ extern void cleanup_module(void); struct module_kobject *lookup_or_create_module_kobject(const char *name); -/* Generic info of form tag = "info" */ -#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info) - /* For userspace: you can also call me... */ #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index bfb85fd13e1f..00166f747e27 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -20,18 +20,19 @@ /* Chosen so that structs with an unsigned long line up. */ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) -#define __MODULE_INFO(tag, name, info) \ - static const char __UNIQUE_ID(name)[] \ +/* Generic info of form tag = "info" */ +#define MODULE_INFO(tag, info) \ + static const char __UNIQUE_ID(modinfo)[] \ __used __section(".modinfo") __aligned(1) \ = __MODULE_INFO_PREFIX __stringify(tag) "=" info #define __MODULE_PARM_TYPE(name, _type) \ - __MODULE_INFO(parmtype, name##type, #name ":" _type) + MODULE_INFO(parmtype, #name ":" _type) /* One for each parameter, describing how to use it. Some files do multiple of these per line, so can't just use MODULE_INFO. */ #define MODULE_PARM_DESC(_parm, desc) \ - __MODULE_INFO(parm, _parm, #_parm ":" desc) + MODULE_INFO(parm, #_parm ":" desc) struct kernel_param; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5078ad868fee..9b39ef630c92 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2662,8 +2662,8 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ - __MODULE_INFO(alias, alias_userspace, name); \ - __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) + MODULE_INFO(alias, name); \ + MODULE_INFO(alias, "tcp-ulp-" name) #ifdef CONFIG_NET_SOCK_MSG struct sk_msg; -- cgit v1.2.3 From 50b4233a22b1ee9ccd0e847597de66ce21329ddb Mon Sep 17 00:00:00 2001 From: Julian Vetter Date: Tue, 3 Jun 2025 15:21:21 +0200 Subject: include/linux/jhash.h: replace __get_unaligned_cpu32 in jhash function __get_unaligned_cpu32() is deprecated. So, replace it with the more generic get_unaligned() and just cast the input parameter. Link: https://lkml.kernel.org/r/20250603132121.3674066-1-julian@outer-limits.org Signed-off-by: Julian Vetter Cc: Arnd Bergmann Cc: Wei-Hsin Yeh Signed-off-by: Andrew Morton --- include/linux/jhash.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jhash.h b/include/linux/jhash.h index fa26a2dd3b52..7c1c1821c694 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -24,7 +24,7 @@ * Jozsef */ #include -#include +#include /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) @@ -77,9 +77,9 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { - a += __get_unaligned_cpu32(k); - b += __get_unaligned_cpu32(k + 4); - c += __get_unaligned_cpu32(k + 8); + a += get_unaligned((u32 *)k); + b += get_unaligned((u32 *)(k + 4)); + c += get_unaligned((u32 *)(k + 8)); __jhash_mix(a, b, c); length -= 12; k += 12; -- cgit v1.2.3 From 2489e958129ff7cbf26a34ee33cdc9ccbd68fe3c Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:11:57 +0800 Subject: relayfs: abolish prev_padding Patch series "relayfs: misc changes", v5. The series mostly focuses on the error counters which helps every user debug their own kernel module. This patch (of 5): prev_padding represents the unused space of certain subbuffer. If the content of a call of relay_write() exceeds the limit of the remainder of this subbuffer, it will skip storing in the rest space and record the start point as buf->prev_padding in relay_switch_subbuf(). Since the buf is a per-cpu big buffer, the point of prev_padding as a global value for the whole buffer instead of a single subbuffer (whose padding info is stored in buf->padding[]) seems meaningless from the real use cases, so we don't bother to record it any more. Link: https://lkml.kernel.org/r/20250612061201.34272-1-kerneljasonxing@gmail.com Link: https://lkml.kernel.org/r/20250612061201.34272-2-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Masami Hiramatsu (Google) Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gt/uc/intel_guc_log.c | 3 +-- drivers/net/wwan/iosm/iosm_ipc_trace.c | 3 +-- drivers/net/wwan/t7xx/t7xx_port_trace.c | 2 +- include/linux/relay.h | 5 +---- kernel/relay.c | 14 ++++++++------ kernel/trace/blktrace.c | 2 +- 6 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c index e8a04e476c57..09a64f224c49 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c @@ -220,8 +220,7 @@ static int guc_action_control_log(struct intel_guc *guc, bool enable, */ static int subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding) + void *prev_subbuf) { /* * Use no-overwrite mode by default, where relay will stop accepting diff --git a/drivers/net/wwan/iosm/iosm_ipc_trace.c b/drivers/net/wwan/iosm/iosm_ipc_trace.c index eeecfa3d10c5..9656254c1c6c 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_trace.c +++ b/drivers/net/wwan/iosm/iosm_ipc_trace.c @@ -51,8 +51,7 @@ static int ipc_trace_remove_buf_file_handler(struct dentry *dentry) } static int ipc_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding) + void *prev_subbuf) { if (relay_buf_full(buf)) { pr_err_ratelimited("Relay_buf full dropping traces"); diff --git a/drivers/net/wwan/t7xx/t7xx_port_trace.c b/drivers/net/wwan/t7xx/t7xx_port_trace.c index 4ed8b4e29bf1..f16d3b01302c 100644 --- a/drivers/net/wwan/t7xx/t7xx_port_trace.c +++ b/drivers/net/wwan/t7xx/t7xx_port_trace.c @@ -33,7 +33,7 @@ static int t7xx_trace_remove_buf_file_handler(struct dentry *dentry) } static int t7xx_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { if (relay_buf_full(buf)) { pr_err_ratelimited("Relay_buf full dropping traces"); diff --git a/include/linux/relay.h b/include/linux/relay.h index b3224111d074..e10a0fdf4325 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -47,7 +47,6 @@ struct rchan_buf unsigned int page_count; /* number of current buffer pages */ unsigned int finalized; /* buffer has been finalized */ size_t *padding; /* padding counts per sub-buffer */ - size_t prev_padding; /* temporary variable */ size_t bytes_consumed; /* bytes consumed in cur read subbuf */ size_t early_bytes; /* bytes consumed before VFS inited */ unsigned int cpu; /* this buf's cpu */ @@ -84,7 +83,6 @@ struct rchan_callbacks * @buf: the channel buffer containing the new sub-buffer * @subbuf: the start of the new sub-buffer * @prev_subbuf: the start of the previous sub-buffer - * @prev_padding: unused space at the end of previous sub-buffer * * The client should return 1 to continue logging, 0 to stop * logging. @@ -100,8 +98,7 @@ struct rchan_callbacks */ int (*subbuf_start) (struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding); + void *prev_subbuf); /* * create_buf_file - create file to represent a relay channel buffer diff --git a/kernel/relay.c b/kernel/relay.c index 3ee5b038d0d9..fc6ad76b789d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -249,13 +249,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full); */ static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { if (!buf->chan->cb->subbuf_start) return !relay_buf_full(buf); return buf->chan->cb->subbuf_start(buf, subbuf, - prev_subbuf, prev_padding); + prev_subbuf); } /** @@ -301,7 +301,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - relay_subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL); } /** @@ -554,9 +554,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) goto toobig; if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; + size_t prev_padding; + + prev_padding = buf->chan->subbuf_size - buf->offset; old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; + buf->padding[old_subbuf] = prev_padding; buf->subbufs_produced++; if (buf->dentry) d_inode(buf->dentry)->i_size += @@ -581,7 +583,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3f6a7bdc6edf..d3083c88474e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -461,7 +461,7 @@ static const struct file_operations blk_msg_fops = { * the user space app in telling how many lost events there were. */ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { struct blk_trace *bt; -- cgit v1.2.3 From ca01a90ae7bf9bb22137e719366bdc0f387675c2 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:11:58 +0800 Subject: relayfs: support a counter tracking if per-cpu buffers is full When using relay mechanism, we often encounter the case where new data are lost or old unconsumed data are overwritten because of slow reader. Add 'full' field in per-cpu buffer structure to detect if the above case is happening. Relay has two modes: 1) non-overwrite mode, 2) overwrite mode. So buffer being full here respectively means: 1) relayfs doesn't intend to accept new data and then simply drop them, or 2) relayfs is going to start over again and overwrite old unread data with new data. Note: this counter doesn't need any explicit lock to protect from being modified by different threads for the better performance consideration. Writers calling __relay_write/relay_write should consider how to use the lock and ensure it performs under the lock protection, thus it's not necessary to add a new small lock here. Link: https://lkml.kernel.org/r/20250612061201.34272-3-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Jens Axboe Reviewed-by: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/relay.h | 9 +++++++++ kernel/relay.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index e10a0fdf4325..cd77eb285a48 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -28,6 +28,14 @@ */ #define RELAYFS_CHANNEL_VERSION 7 +/* + * Relay buffer statistics + */ +struct rchan_buf_stats +{ + unsigned int full_count; /* counter for buffer full */ +}; + /* * Per-cpu relay channel buffer */ @@ -43,6 +51,7 @@ struct rchan_buf struct irq_work wakeup_work; /* reader wakeup */ struct dentry *dentry; /* channel file dentry */ struct kref kref; /* channel buffer refcount */ + struct rchan_buf_stats stats; /* buffer stats */ struct page **page_array; /* array of current buffer pages */ unsigned int page_count; /* number of current buffer pages */ unsigned int finalized; /* buffer has been finalized */ diff --git a/kernel/relay.c b/kernel/relay.c index fc6ad76b789d..4b07efddc2cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -251,8 +251,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full); static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, void *prev_subbuf) { + int full = relay_buf_full(buf); + + if (full) + buf->stats.full_count++; + if (!buf->chan->cb->subbuf_start) - return !relay_buf_full(buf); + return !full; return buf->chan->cb->subbuf_start(buf, subbuf, prev_subbuf); @@ -297,6 +302,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->finalized = 0; buf->data = buf->start; buf->offset = 0; + buf->stats.full_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; -- cgit v1.2.3 From a53202ce7fbafd24f854865b02eff891e246c550 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:11:59 +0800 Subject: relayfs: introduce getting relayfs statistics function In this version, only support getting the counter for buffer full and implement the framework of how it works. Users can pass certain flag to fetch what field/statistics they expect to know. Each time it only returns one result. So do not pass multiple flags. Link: https://lkml.kernel.org/r/20250612061201.34272-4-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Masami Hiramatsu (Google) Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/relay.h | 7 +++++++ kernel/relay.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index cd77eb285a48..5310967f9d74 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -31,6 +31,12 @@ /* * Relay buffer statistics */ +enum { + RELAY_STATS_BUF_FULL = (1 << 0), + + RELAY_STATS_LAST = RELAY_STATS_BUF_FULL, +}; + struct rchan_buf_stats { unsigned int full_count; /* counter for buffer full */ @@ -167,6 +173,7 @@ struct rchan *relay_open(const char *base_filename, void *private_data); extern void relay_close(struct rchan *chan); extern void relay_flush(struct rchan *chan); +size_t relay_stats(struct rchan *chan, int flags); extern void relay_subbufs_consumed(struct rchan *chan, unsigned int cpu, size_t consumed); diff --git a/kernel/relay.c b/kernel/relay.c index 4b07efddc2cf..2fc27c0e771e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -700,6 +700,36 @@ void relay_flush(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_flush); +/** + * relay_stats - get channel buffer statistics + * @chan: the channel + * @flags: select particular information to get + * + * Returns the count of certain field that caller specifies. + */ +size_t relay_stats(struct rchan *chan, int flags) +{ + unsigned int i, count = 0; + struct rchan_buf *rbuf; + + if (!chan || flags > RELAY_STATS_LAST) + return 0; + + if (chan->is_global) { + rbuf = *per_cpu_ptr(chan->buf, 0); + if (flags & RELAY_STATS_BUF_FULL) + count = rbuf->stats.full_count; + } else { + for_each_online_cpu(i) { + rbuf = *per_cpu_ptr(chan->buf, i); + if (rbuf && flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + } + } + + return count; +} + /** * relay_file_open - open file op for relay files * @inode: the inode -- cgit v1.2.3 From 19f3cb64a25b80db667a00182785577fae465b3e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:12:01 +0800 Subject: relayfs: support a counter tracking if data is too big to write It really doesn't matter if the user/admin knows what the last too big value is. Record how many times this case is triggered would be helpful. Solve the existing issue where relay_reset() doesn't restore the value. Store the counter in the per-cpu buffer structure instead of the global buffer structure. It also solves the racy condition which is likely to happen when a few of per-cpu buffers encounter the too big data case and then access the global field last_toobig without lock protection. Remove the printk in relay_close() since kernel module can directly call relay_stats() as they want. Link: https://lkml.kernel.org/r/20250612061201.34272-6-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Masami Hiramatsu (Google) Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/relay.h | 5 +++-- kernel/relay.c | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index 5310967f9d74..6772a7075840 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -33,13 +33,15 @@ */ enum { RELAY_STATS_BUF_FULL = (1 << 0), + RELAY_STATS_WRT_BIG = (1 << 1), - RELAY_STATS_LAST = RELAY_STATS_BUF_FULL, + RELAY_STATS_LAST = RELAY_STATS_WRT_BIG, }; struct rchan_buf_stats { unsigned int full_count; /* counter for buffer full */ + unsigned int big_count; /* counter for too big to write */ }; /* @@ -79,7 +81,6 @@ struct rchan const struct rchan_callbacks *cb; /* client callbacks */ struct kref kref; /* channel refcount */ void *private_data; /* for user-defined data */ - size_t last_toobig; /* tried to log event > subbuf size */ struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */ int is_global; /* One global buffer ? */ struct list_head list; /* for channel list */ diff --git a/kernel/relay.c b/kernel/relay.c index 2fc27c0e771e..8d915fe98198 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -303,6 +303,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->data = buf->start; buf->offset = 0; buf->stats.full_count = 0; + buf->stats.big_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; @@ -602,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) return length; toobig: - buf->chan->last_toobig = length; + buf->stats.big_count++; return 0; } EXPORT_SYMBOL_GPL(relay_switch_subbuf); @@ -662,11 +663,6 @@ void relay_close(struct rchan *chan) if ((buf = *per_cpu_ptr(chan->buf, i))) relay_close_buf(buf); - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%zd) > sub-buffer size (%zd)]\n", - chan->last_toobig, chan->subbuf_size); - list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); @@ -719,11 +715,17 @@ size_t relay_stats(struct rchan *chan, int flags) rbuf = *per_cpu_ptr(chan->buf, 0); if (flags & RELAY_STATS_BUF_FULL) count = rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count = rbuf->stats.big_count; } else { for_each_online_cpu(i) { rbuf = *per_cpu_ptr(chan->buf, i); - if (rbuf && flags & RELAY_STATS_BUF_FULL) - count += rbuf->stats.full_count; + if (rbuf) { + if (flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count += rbuf->stats.big_count; + } } } -- cgit v1.2.3 From 1857fcc847443b0238cb64584b43d8c3a9049a0a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 17 Mar 2025 17:02:28 +0800 Subject: lib/raid6: replace custom zero page with ZERO_PAGE Use the system-wide zero page instead of a custom zero page. [herbert@gondor.apana.org.au: update lib/raid6/recov_rvv.c, per Klara] Link: https://lkml.kernel.org/r/aFkUnXWtxcgOTVkw@gondor.apana.org.au Link: https://lkml.kernel.org/r/Z9flJNkWQICx0PXk@gondor.apana.org.au Signed-off-by: Herbert Xu Cc: Song Liu Cc: Yu Kuai Cc: Klara Modin Signed-off-by: Andrew Morton --- crypto/async_tx/async_pq.c | 2 +- crypto/async_tx/async_raid6_recov.c | 4 ++-- include/linux/raid/pq.h | 12 +++++++++++- lib/raid6/algos.c | 3 --- lib/raid6/recov.c | 6 +++--- lib/raid6/recov_avx2.c | 6 +++--- lib/raid6/recov_avx512.c | 6 +++--- lib/raid6/recov_loongarch_simd.c | 12 ++++++------ lib/raid6/recov_neon.c | 6 +++--- lib/raid6/recov_s390xc.c | 6 +++--- lib/raid6/recov_ssse3.c | 6 +++--- 11 files changed, 38 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index 5e2b2680d7db..9e4bb7fbde25 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -119,7 +119,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks, for (i = 0; i < disks; i++) { if (blocks[i] == NULL) { BUG_ON(i > disks - 3); /* P or Q can't be zero */ - srcs[i] = (void*)raid6_empty_zero_page; + srcs[i] = raid6_get_zero_page(); } else { srcs[i] = page_address(blocks[i]) + offsets[i]; diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c index 354b8cd5537f..539ea5b378dc 100644 --- a/crypto/async_tx/async_raid6_recov.c +++ b/crypto/async_tx/async_raid6_recov.c @@ -414,7 +414,7 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) if (blocks[i] == NULL) - ptrs[i] = (void *) raid6_empty_zero_page; + ptrs[i] = raid6_get_zero_page(); else ptrs[i] = page_address(blocks[i]) + offs[i]; @@ -497,7 +497,7 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila, async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) if (blocks[i] == NULL) - ptrs[i] = (void*)raid6_empty_zero_page; + ptrs[i] = raid6_get_zero_page(); else ptrs[i] = page_address(blocks[i]) + offs[i]; diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 72ff44cca864..2467b3be15c9 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -11,8 +11,13 @@ #ifdef __KERNEL__ #include +#include -extern const char raid6_empty_zero_page[PAGE_SIZE]; +/* This should be const but the raid6 code is too convoluted for that. */ +static inline void *raid6_get_zero_page(void) +{ + return page_address(ZERO_PAGE(0)); +} #else /* ! __KERNEL__ */ /* Used for testing in user space */ @@ -191,6 +196,11 @@ static inline uint32_t raid6_jiffies(void) return tv.tv_sec*1000 + tv.tv_usec/1000; } +static inline void *raid6_get_zero_page(void) +{ + return raid6_empty_zero_page; +} + #endif /* ! __KERNEL__ */ #endif /* LINUX_RAID_RAID6_H */ diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 75ce3e134b7c..799e0e5eac26 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -18,9 +18,6 @@ #else #include #include -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -EXPORT_SYMBOL(raid6_empty_zero_page); #endif struct raid6_calls raid6_call; diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index a7c1b2bbe40d..b5e47c008b41 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -31,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -72,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index 4e8095403ee2..97d598d2535c 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c index 310c715db313..7986120ca444 100644 --- a/lib/raid6/recov_avx512.c +++ b/lib/raid6/recov_avx512.c @@ -37,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -238,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c index 94aeac85e6f7..93dc515997a1 100644 --- a/lib/raid6/recov_loongarch_simd.c +++ b/lib/raid6/recov_loongarch_simd.c @@ -42,10 +42,10 @@ static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -197,7 +197,7 @@ static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -316,10 +316,10 @@ static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -436,7 +436,7 @@ static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index 1bfc14174d4d..70e1404c1512 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -36,10 +36,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -74,7 +74,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c index 179eec900cea..1d32c01261be 100644 --- a/lib/raid6/recov_s390xc.c +++ b/lib/raid6/recov_s390xc.c @@ -35,10 +35,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -82,7 +82,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index 4bfa3c6b60de..2e849185c32b 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); -- cgit v1.2.3 From 35c18f2933c596b4fd6a98baee36f3137d133a5f Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 12 Jun 2025 12:13:21 +0200 Subject: Add a new optional ",cma" suffix to the crashkernel= command line option Patch series "kdump: crashkernel reservation from CMA", v5. This series implements a way to reserve additional crash kernel memory using CMA. Currently, all the memory for the crash kernel is not usable by the 1st (production) kernel. It is also unmapped so that it can't be corrupted by the fault that will eventually trigger the crash. This makes sense for the memory actually used by the kexec-loaded crash kernel image and initrd and the data prepared during the load (vmcoreinfo, ...). However, the reserved space needs to be much larger than that to provide enough run-time memory for the crash kernel and the kdump userspace. Estimating the amount of memory to reserve is difficult. Being too careful makes kdump likely to end in OOM, being too generous takes even more memory from the production system. Also, the reservation only allows reserving a single contiguous block (or two with the "low" suffix). I've seen systems where this fails because the physical memory is fragmented. By reserving additional crashkernel memory from CMA, the main crashkernel reservation can be just large enough to fit the kernel and initrd image, minimizing the memory taken away from the production system. Most of the run-time memory for the crash kernel will be memory previously available to userspace in the production system. As this memory is no longer wasted, the reservation can be done with a generous margin, making kdump more reliable. Kernel memory that we need to preserve for dumping is normally not allocated from CMA, unless it is explicitly allocated as movable. Currently this is only the case for memory ballooning and zswap. Such movable memory will be missing from the vmcore. User data is typically not dumped by makedumpfile. When dumping of user data is intended this new CMA reservation cannot be used. There are five patches in this series: The first adds a new ",cma" suffix to the recenly introduced generic crashkernel parsing code. parse_crashkernel() takes one more argument to store the cma reservation size. The second patch implements reserve_crashkernel_cma() which performs the reservation. If the requested size is not available in a single range, multiple smaller ranges will be reserved. The third patch updates Documentation/, explicitly mentioning the potential DMA corruption of the CMA-reserved memory. The fourth patch adds a short delay before booting the kdump kernel, allowing pending DMA transfers to finish. The fifth patch enables the functionality for x86 as a proof of concept. There are just three things every arch needs to do: - call reserve_crashkernel_cma() - include the CMA-reserved ranges in the physical memory map - exclude the CMA-reserved ranges from the memory available through /proc/vmcore by excluding them from the vmcoreinfo PT_LOAD ranges. Adding other architectures is easy and I can do that as soon as this series is merged. With this series applied, specifying crashkernel=100M craskhernel=1G,cma on the command line will make a standard crashkernel reservation of 100M, where kexec will load the kernel and initrd. An additional 1G will be reserved from CMA, still usable by the production system. The crash kernel will have 1.1G memory available. The 100M can be reliably predicted based on the size of the kernel and initrd. The new cma suffix is completely optional. When no crashkernel=size,cma is specified, everything works as before. This patch (of 5): Add a new cma_size parameter to parse_crashkernel(). When not NULL, call __parse_crashkernel to parse the CMA reservation size from "crashkernel=size,cma" and store it in cma_size. Set cma_size to NULL in all calls to parse_crashkernel(). Link: https://lkml.kernel.org/r/aEqnxxfLZMllMC8I@dwarf.suse.cz Link: https://lkml.kernel.org/r/aEqoQckgoTQNULnh@dwarf.suse.cz Signed-off-by: Jiri Bohac Cc: Baoquan He Cc: Dave Young Cc: Donald Dutile Cc: Michal Hocko Cc: Philipp Rudo Cc: Pingfan Liu Cc: Tao Liu Cc: Vivek Goyal Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/arm/kernel/setup.c | 2 +- arch/arm64/mm/init.c | 2 +- arch/loongarch/kernel/setup.c | 2 +- arch/mips/kernel/setup.c | 2 +- arch/powerpc/kernel/fadump.c | 2 +- arch/powerpc/kexec/core.c | 2 +- arch/powerpc/mm/nohash/kaslr_booke.c | 2 +- arch/riscv/mm/init.c | 2 +- arch/s390/kernel/setup.c | 2 +- arch/sh/kernel/machine_kexec.c | 2 +- arch/x86/kernel/setup.c | 2 +- include/linux/crash_reserve.h | 3 ++- kernel/crash_reserve.c | 16 ++++++++++++++-- 13 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index a41c93988d2c..0bfd66c7ada0 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1004,7 +1004,7 @@ static void __init reserve_crashkernel(void) total_mem = get_total_mem(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, - NULL, NULL); + NULL, NULL, NULL); /* invalid value specified or crashkernel=0 */ if (ret || !crash_size) return; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0c8c35dd645e..ea84a61ed508 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index b99fbb388fe0..22b27cd447a1 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -265,7 +265,7 @@ static void __init arch_reserve_crashkernel(void) return; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base, &low_size, &high); + &crash_size, &crash_base, &low_size, NULL, &high); if (ret) return; diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index fbfe0771317e..11b9b6b63e19 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -458,7 +458,7 @@ static void __init mips_parse_crashkernel(void) total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, - NULL, NULL); + NULL, NULL, NULL); if (ret != 0 || crash_size <= 0) return; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8ca49e40c473..28cab25d5b33 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -333,7 +333,7 @@ static __init u64 fadump_calculate_reserve_size(void) * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &size, &base, NULL, NULL); + &size, &base, NULL, NULL, NULL); if (ret == 0 && size > 0) { unsigned long max_size; diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 00e9c267b912..d1a2d755381c 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -110,7 +110,7 @@ void __init arch_reserve_crashkernel(void) /* use common parsing */ ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size, - &crash_base, NULL, NULL); + &crash_base, NULL, NULL, NULL); if (ret) return; diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 5c8d1bb98b3e..5e4897daaaea 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size) int ret; ret = parse_crashkernel(boot_command_line, size, &crash_size, - &crash_base, NULL, NULL); + &crash_base, NULL, NULL, NULL); if (ret != 0 || crash_size == 0) return; if (crash_base == 0) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8d0374d7ce8e..15683ae13fa5 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1408,7 +1408,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f244c5560e7f..b99aeb0db2ee 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -605,7 +605,7 @@ static void __init reserve_crashkernel(void) int rc; rc = parse_crashkernel(boot_command_line, ident_map_size, - &crash_size, &crash_base, NULL, NULL); + &crash_size, &crash_base, NULL, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index 8321b31d2e19..37073ca1e0ad 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -146,7 +146,7 @@ void __init reserve_crashkernel(void) return; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base, NULL, NULL); + &crash_size, &crash_base, NULL, NULL, NULL); if (ret == 0 && crash_size > 0) { crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fb27be697128..c22dc630c297 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -608,7 +608,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 1fe7e7d1b214..e784aaff2f5a 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -16,7 +16,8 @@ extern struct resource crashk_low_res; int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); + unsigned long long *low_size, unsigned long long *cma_size, + bool *high); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index acb6bf42e30d..86ae1365d04e 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -172,17 +172,19 @@ static int __init parse_crashkernel_simple(char *cmdline, #define SUFFIX_HIGH 0 #define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 +#define SUFFIX_CMA 2 +#define SUFFIX_NULL 3 static __initdata char *suffix_tbl[] = { [SUFFIX_HIGH] = ",high", [SUFFIX_LOW] = ",low", + [SUFFIX_CMA] = ",cma", [SUFFIX_NULL] = NULL, }; /* * That function parses "suffix" crashkernel command lines like * - * crashkernel=size,[high|low] + * crashkernel=size,[high|low|cma] * * It returns 0 on success and -EINVAL on failure. */ @@ -298,9 +300,11 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, + unsigned long long *cma_size, bool *high) { int ret; + unsigned long long __always_unused cma_base; /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, @@ -331,6 +335,14 @@ int __init parse_crashkernel(char *cmdline, *high = true; } + + /* + * optional CMA reservation + * cma_base is ignored + */ + if (cma_size) + __parse_crashkernel(cmdline, 0, cma_size, + &cma_base, suffix_tbl[SUFFIX_CMA]); #endif if (!*crash_size) ret = -EINVAL; -- cgit v1.2.3 From ab475510e0422bb5672d465f9d0f523d72fdb7f1 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 12 Jun 2025 12:16:39 +0200 Subject: kdump: implement reserve_crashkernel_cma reserve_crashkernel_cma() reserves CMA ranges for the crash kernel. If allocating the requested size fails, try to reserve in smaller blocks. Store the reserved ranges in the crashk_cma_ranges array and the number of ranges in crashk_cma_cnt. Link: https://lkml.kernel.org/r/aEqpBwOy_ekm0gw9@dwarf.suse.cz Signed-off-by: Jiri Bohac Cc: Baoquan He Cc: Dave Young Cc: David Hildenbrand Cc: Donald Dutile Cc: Michal Hocko Cc: Philipp Rudo Cc: Pingfan Liu Cc: Tao Liu Cc: Vivek Goyal Signed-off-by: Andrew Morton --- include/linux/crash_reserve.h | 12 ++++++++++ kernel/crash_reserve.c | 52 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index e784aaff2f5a..7b44b41d0a20 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -13,12 +13,24 @@ */ extern struct resource crashk_res; extern struct resource crashk_low_res; +extern struct range crashk_cma_ranges[]; +#if defined(CONFIG_CMA) && defined(CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION) +#define CRASHKERNEL_CMA +#define CRASHKERNEL_CMA_RANGES_MAX 4 +extern int crashk_cma_cnt; +#else +#define crashk_cma_cnt 0 +#define CRASHKERNEL_CMA_RANGES_MAX 0 +#endif + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, unsigned long long *cma_size, bool *high); +void __init reserve_crashkernel_cma(unsigned long long cma_size); + #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 86ae1365d04e..87bf4d41eabb 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -469,6 +471,56 @@ retry: #endif } +struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX]; +#ifdef CRASHKERNEL_CMA +int crashk_cma_cnt; +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + unsigned long long request_size = roundup(cma_size, PAGE_SIZE); + unsigned long long reserved_size = 0; + + if (!cma_size) + return; + + while (cma_size > reserved_size && + crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) { + + struct cma *res; + + if (cma_declare_contiguous(0, request_size, 0, 0, 0, false, + "crashkernel", &res)) { + /* reservation failed, try half-sized blocks */ + if (request_size <= PAGE_SIZE) + break; + + request_size = roundup(request_size / 2, PAGE_SIZE); + continue; + } + + crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res); + crashk_cma_ranges[crashk_cma_cnt].end = + crashk_cma_ranges[crashk_cma_cnt].start + + cma_get_size(res) - 1; + ++crashk_cma_cnt; + reserved_size += request_size; + } + + if (cma_size > reserved_size) + pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n", + cma_size >> 20, reserved_size >> 20, crashk_cma_cnt); + else + pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n", + reserved_size >> 20, crashk_cma_cnt); +} + +#else /* CRASHKERNEL_CMA */ +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + if (cma_size) + pr_warn("crashkernel CMA reservation not supported\n"); +} +#endif + #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { -- cgit v1.2.3 From b76e89e50fc3693b7b8a443ed906320d8ccb93fd Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jul 2025 10:10:01 +0800 Subject: panic: generalize panic_print's function to show sys info 'panic_print' was introduced to help debugging kernel panic by dumping different kinds of system information like tasks' call stack, memory, ftrace buffer, etc. Actually this function could also be used to help debugging other cases like task-hung, soft/hard lockup, etc. where user may need the snapshot of system info at that time. Extract system info dump function related code from panic.c to separate file sys_info.[ch], for wider usage by other kernel parts for debugging. Also modify the macro names about singulars/plurals. Link: https://lkml.kernel.org/r/20250703021004.42328-3-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/sys_info.h | 20 ++++++++++++++++++++ kernel/panic.c | 36 ++++-------------------------------- lib/Makefile | 2 +- lib/sys_info.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 33 deletions(-) create mode 100644 include/linux/sys_info.h create mode 100644 lib/sys_info.c (limited to 'include/linux') diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h new file mode 100644 index 000000000000..53b7e27dbf2a --- /dev/null +++ b/include/linux/sys_info.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SYS_INFO_H +#define _LINUX_SYS_INFO_H + +/* + * SYS_INFO_PANIC_CONSOLE_REPLAY is for panic case only, as it needs special + * handling which only fits panic case. + */ +#define SYS_INFO_TASKS 0x00000001 +#define SYS_INFO_MEM 0x00000002 +#define SYS_INFO_TIMERS 0x00000004 +#define SYS_INFO_LOCKS 0x00000008 +#define SYS_INFO_FTRACE 0x00000010 +#define SYS_INFO_PANIC_CONSOLE_REPLAY 0x00000020 +#define SYS_INFO_ALL_CPU_BT 0x00000040 +#define SYS_INFO_BLOCKED_TASKS 0x00000080 + +void sys_info(unsigned long si_mask); + +#endif /* _LINUX_SYS_INFO_H */ diff --git a/kernel/panic.c b/kernel/panic.c index 9b6c5dc28a65..cbb0681177b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -69,14 +70,6 @@ bool panic_triggering_all_cpu_backtrace; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); -#define PANIC_PRINT_TASK_INFO 0x00000001 -#define PANIC_PRINT_MEM_INFO 0x00000002 -#define PANIC_PRINT_TIMER_INFO 0x00000004 -#define PANIC_PRINT_LOCK_INFO 0x00000008 -#define PANIC_PRINT_FTRACE_INFO 0x00000010 -#define PANIC_CONSOLE_REPLAY 0x00000020 -#define PANIC_PRINT_ALL_CPU_BT 0x00000040 -#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -240,31 +233,10 @@ EXPORT_SYMBOL(nmi_panic); static void panic_console_replay(void) { - if (panic_print & PANIC_CONSOLE_REPLAY) + if (panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) console_flush_on_panic(CONSOLE_REPLAY_ALL); } -static void panic_print_sys_info(void) -{ - if (panic_print & PANIC_PRINT_TASK_INFO) - show_state(); - - if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(); - - if (panic_print & PANIC_PRINT_TIMER_INFO) - sysrq_timer_list_show(); - - if (panic_print & PANIC_PRINT_LOCK_INFO) - debug_show_all_locks(); - - if (panic_print & PANIC_PRINT_FTRACE_INFO) - ftrace_dump(DUMP_ALL); - - if (panic_print & PANIC_PRINT_BLOCKED_TASKS) - show_state_filter(TASK_UNINTERRUPTIBLE); -} - void check_panic_on_warn(const char *origin) { unsigned int limit; @@ -285,7 +257,7 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + if (panic_print & SYS_INFO_ALL_CPU_BT) { /* Temporary allow non-panic CPUs to write their backtraces. */ panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); @@ -410,7 +382,7 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - panic_print_sys_info(); + sys_info(panic_print); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); diff --git a/lib/Makefile b/lib/Makefile index c38582f187dd..88d6228089a8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -40,7 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ - buildid.o objpool.o iomem_copy.o + buildid.o objpool.o iomem_copy.o sys_info.o lib-$(CONFIG_UNION_FIND) += union_find.o lib-$(CONFIG_PRINTK) += dump_stack.o diff --git a/lib/sys_info.c b/lib/sys_info.c new file mode 100644 index 000000000000..53031e5cb98e --- /dev/null +++ b/lib/sys_info.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include + +#include + +void sys_info(unsigned long si_mask) +{ + if (si_mask & SYS_INFO_TASKS) + show_state(); + + if (si_mask & SYS_INFO_MEM) + show_mem(); + + if (si_mask & SYS_INFO_TIMERS) + sysrq_timer_list_show(); + + if (si_mask & SYS_INFO_LOCKS) + debug_show_all_locks(); + + if (si_mask & SYS_INFO_FTRACE) + ftrace_dump(DUMP_ALL); + + if (si_mask & SYS_INFO_ALL_CPU_BT) + trigger_all_cpu_backtrace(); + + if (si_mask & SYS_INFO_BLOCKED_TASKS) + show_state_filter(TASK_UNINTERRUPTIBLE); +} -- cgit v1.2.3 From d747755917bf8ae08f490c3fe7d8e321afab8127 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jul 2025 10:10:02 +0800 Subject: panic: add 'panic_sys_info' sysctl to take human readable string parameter Bitmap definition for 'panic_print' is hard to remember and decode. Add 'panic_sys_info='sysctl to take human readable string like "tasks,mem,timers,locks,ftrace,..." and translate it into bitmap. The detailed mapping is: SYS_INFO_TASKS "tasks" SYS_INFO_MEM "mem" SYS_INFO_TIMERS "timers" SYS_INFO_LOCKS "locks" SYS_INFO_FTRACE "ftrace" SYS_INFO_ALL_CPU_BT "all_bt" SYS_INFO_BLOCKED_TASKS "blocked_tasks" [nathan@kernel.org: add __maybe_unused to sys_info_avail] Link: https://lkml.kernel.org/r/20250708-fix-clang-sys_info_avail-warning-v1-1-60d239eacd64@kernel.org Link: https://lkml.kernel.org/r/20250703021004.42328-4-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Cc: Nathan Chancellor Cc: Andy Shevchenko Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 18 ++++++ include/linux/sys_info.h | 8 +++ kernel/panic.c | 7 +++ lib/sys_info.c | 90 +++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+) (limited to 'include/linux') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 0d08b7a2db2d..cccb06d1a6bf 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -899,6 +899,24 @@ So for example to print tasks and memory info on panic, user can:: echo 3 > /proc/sys/kernel/panic_print +panic_sys_info +============== + +A comma separated list of extra information to be dumped on panic, +for example, "tasks,mem,timers,...". It is a human readable alternative +to 'panic_print'. Possible values are: + +============= =================================================== +tasks print all tasks info +mem print system memory info +timer print timers info +lock print locks info if CONFIG_LOCKDEP is on +ftrace print ftrace buffer +all_bt print all CPUs backtrace (if available in the arch) +blocked_tasks print only tasks in uninterruptible (blocked) state +============= =================================================== + + panic_on_rcu_stall ================== diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h index 53b7e27dbf2a..89d77dc4f2ed 100644 --- a/include/linux/sys_info.h +++ b/include/linux/sys_info.h @@ -2,6 +2,8 @@ #ifndef _LINUX_SYS_INFO_H #define _LINUX_SYS_INFO_H +#include + /* * SYS_INFO_PANIC_CONSOLE_REPLAY is for panic case only, as it needs special * handling which only fits panic case. @@ -16,5 +18,11 @@ #define SYS_INFO_BLOCKED_TASKS 0x00000080 void sys_info(unsigned long si_mask); +unsigned long sys_info_parse_param(char *str); +#ifdef CONFIG_SYSCTL +int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, + void *buffer, size_t *lenp, + loff_t *ppos); +#endif #endif /* _LINUX_SYS_INFO_H */ diff --git a/kernel/panic.c b/kernel/panic.c index cbb0681177b3..d7aa427dc23c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -126,6 +126,13 @@ static const struct ctl_table kern_panic_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, + { + .procname = "panic_sys_info", + .data = &panic_print, + .maxlen = sizeof(panic_print), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static __init int kernel_panic_sysctls_init(void) diff --git a/lib/sys_info.c b/lib/sys_info.c index 53031e5cb98e..5bf503fd7ec1 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -3,10 +3,100 @@ #include #include #include +#include #include #include +struct sys_info_name { + unsigned long bit; + const char *name; +}; + +/* + * When 'si_names' gets updated, please make sure the 'sys_info_avail' + * below is updated accordingly. + */ +static const struct sys_info_name si_names[] = { + { SYS_INFO_TASKS, "tasks" }, + { SYS_INFO_MEM, "mem" }, + { SYS_INFO_TIMERS, "timers" }, + { SYS_INFO_LOCKS, "locks" }, + { SYS_INFO_FTRACE, "ftrace" }, + { SYS_INFO_ALL_CPU_BT, "all_bt" }, + { SYS_INFO_BLOCKED_TASKS, "blocked_tasks" }, +}; + +/* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." */ +unsigned long sys_info_parse_param(char *str) +{ + unsigned long si_bits = 0; + char *s, *name; + int i; + + s = str; + while ((name = strsep(&s, ",")) && *name) { + for (i = 0; i < ARRAY_SIZE(si_names); i++) { + if (!strcmp(name, si_names[i].name)) { + si_bits |= si_names[i].bit; + break; + } + } + } + + return si_bits; +} + +#ifdef CONFIG_SYSCTL + +static const char sys_info_avail[] __maybe_unused = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks"; + +int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(sys_info_avail) + 1]; + struct ctl_table table; + unsigned long *si_bits_global; + + si_bits_global = ro_table->data; + + if (write) { + unsigned long si_bits; + int ret; + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + si_bits = sys_info_parse_param(names); + /* The access to the global value is not synchronized. */ + WRITE_ONCE(*si_bits_global, si_bits); + return 0; + } else { + /* for 'read' operation */ + char *delim = ""; + int i, len = 0; + + for (i = 0; i < ARRAY_SIZE(si_names); i++) { + if (*si_bits_global & si_names[i].bit) { + len += scnprintf(names + len, sizeof(names) - len, + "%s%s", delim, si_names[i].name); + delim = ","; + } + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + return proc_dostring(&table, write, buffer, lenp, ppos); + } +} +#endif + void sys_info(unsigned long si_mask) { if (si_mask & SYS_INFO_TASKS) -- cgit v1.2.3 From ae2da51def76020fa16f53cd3446c00cafe41008 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 27 Jun 2025 15:29:22 +0800 Subject: locking/rwsem: make owner helpers globally available Patch series "extend hung task blocker tracking to rwsems". Inspired by mutex blocker tracking[1], and having already extended it to semaphores, let's now add support for reader-writer semaphores (rwsems). The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while waiting for an rwsem, we just call hung_task_set_blocker(). The hung task detector can then query the rwsem's owner to identify the lock holder. Tracking works reliably for writers, as there can only be a single writer holding the lock, and its task struct is stored in the owner field. The main challenge lies with readers. The owner field points to only one of many concurrent readers, so we might lose track of the blocker if that specific reader unlocks, even while others remain. This is not a significant issue, however. In practice, long-lasting lock contention is almost always caused by a writer. Therefore, reliably tracking the writer is the primary goal of this patch series ;) With this change, the hung task detector can now show blocker task's info like below: [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds. [Fri Jun 27 15:21:34 2025] Tainted: G S 6.16.0-rc3 #8 [Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Fri Jun 27 15:21:34 2025] task:cat state:D stack:0 pid:28631 tgid:28631 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? policy_nodemask+0x215/0x340 [Fri Jun 27 15:21:34 2025] ? _raw_spin_lock_irq+0x8a/0xe0 [Fri Jun 27 15:21:34 2025] ? __pfx__raw_spin_lock_irq+0x10/0x10 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_preempt_disabled+0x15/0x30 [Fri Jun 27 15:21:34 2025] rwsem_down_read_slowpath+0x55e/0xe10 [Fri Jun 27 15:21:34 2025] ? __pfx_rwsem_down_read_slowpath+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx___might_resched+0x10/0x10 [Fri Jun 27 15:21:34 2025] down_read+0xc9/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_down_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __debugfs_file_get+0x14d/0x700 [Fri Jun 27 15:21:34 2025] ? __pfx___debugfs_file_get+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? handle_pte_fault+0x52a/0x710 [Fri Jun 27 15:21:34 2025] ? selinux_file_permission+0x3a9/0x590 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_read+0x4a/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? __pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked on an rw-semaphore likely owned by task cat:28630 [Fri Jun 27 15:21:34 2025] task:cat state:S stack:0 pid:28630 tgid:28630 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __mod_timer+0x304/0xa80 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_timeout+0xfb/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_schedule_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx_process_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? down_write+0xc4/0x140 [Fri Jun 27 15:21:34 2025] msleep_interruptible+0xbe/0x150 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_write+0x54/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? __pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] This patch (of 3): In preparation for extending blocker tracking to support rwsems, make the rwsem_owner() and is_rwsem_reader_owned() helpers globally available for determining if the blocker is a writer or one of the readers. Additionally, a stale owner pointer in a reader-owned rwsem can lead to false positives in blocker tracking when CONFIG_DETECT_HUNG_TASK_BLOCKER is enabled. To mitigate this, clear the owner field on the reader unlock path, similar to what CONFIG_DEBUG_RWSEMS does. A NULL owner is better than a stale one for diagnostics. Link: https://lkml.kernel.org/r/20250627072924.36567-1-lance.yang@linux.dev Link: https://lkml.kernel.org/r/20250627072924.36567-2-lance.yang@linux.dev Link: https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/ [1] Signed-off-by: Lance Yang Reviewed-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Mingzhe Yang Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/rwsem.h | 12 ++++++++++++ kernel/locking/rwsem.c | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index c8b543d428b0..544853bed5b9 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -132,6 +132,18 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) return !list_empty(&sem->wait_list); } +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) +/* + * Return just the real task structure pointer of the owner + */ +extern struct task_struct *rwsem_owner(struct rw_semaphore *sem); + +/* + * Return true if the rwsem is owned by a reader. + */ +extern bool is_rwsem_reader_owned(struct rw_semaphore *sem); +#endif + #else /* !CONFIG_PREEMPT_RT */ #include diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2ddb827e3bea..a310eb9896de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -181,11 +181,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } -#ifdef CONFIG_DEBUG_RWSEMS +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); @@ -194,7 +194,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) /* * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +bool is_rwsem_reader_owned(struct rw_semaphore *sem) { /* * Check the count to see if it is write-locked. @@ -207,10 +207,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) } /* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { -- cgit v1.2.3 From 77da18de55ac6417e48905bec8b3c66f023b15a9 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 27 Jun 2025 15:29:23 +0800 Subject: hung_task: extend hung task blocker tracking to rwsems Inspired by mutex blocker tracking[1], and having already extended it to semaphores, let's now add support for reader-writer semaphores (rwsems). The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while waiting for an rwsem, we just call hung_task_set_blocker(). The hung task detector can then query the rwsem's owner to identify the lock holder. Tracking works reliably for writers, as there can only be a single writer holding the lock, and its task struct is stored in the owner field. The main challenge lies with readers. The owner field points to only one of many concurrent readers, so we might lose track of the blocker if that specific reader unlocks, even while others remain. This is not a significant issue, however. In practice, long-lasting lock contention is almost always caused by a writer. Therefore, reliably tracking the writer is the primary goal of this patch series ;) With this change, the hung task detector can now show blocker task's info like below: [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds. [Fri Jun 27 15:21:34 2025] Tainted: G S 6.16.0-rc3 #8 [Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Fri Jun 27 15:21:34 2025] task:cat state:D stack:0 pid:28631 tgid:28631 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? policy_nodemask+0x215/0x340 [Fri Jun 27 15:21:34 2025] ? _raw_spin_lock_irq+0x8a/0xe0 [Fri Jun 27 15:21:34 2025] ? __pfx__raw_spin_lock_irq+0x10/0x10 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_preempt_disabled+0x15/0x30 [Fri Jun 27 15:21:34 2025] rwsem_down_read_slowpath+0x55e/0xe10 [Fri Jun 27 15:21:34 2025] ? __pfx_rwsem_down_read_slowpath+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx___might_resched+0x10/0x10 [Fri Jun 27 15:21:34 2025] down_read+0xc9/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_down_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __debugfs_file_get+0x14d/0x700 [Fri Jun 27 15:21:34 2025] ? __pfx___debugfs_file_get+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? handle_pte_fault+0x52a/0x710 [Fri Jun 27 15:21:34 2025] ? selinux_file_permission+0x3a9/0x590 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_read+0x4a/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? __pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked on an rw-semaphore likely owned by task cat:28630 [Fri Jun 27 15:21:34 2025] task:cat state:S stack:0 pid:28630 tgid:28630 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __mod_timer+0x304/0xa80 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_timeout+0xfb/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_schedule_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx_process_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? down_write+0xc4/0x140 [Fri Jun 27 15:21:34 2025] msleep_interruptible+0xbe/0x150 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_write+0x54/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? __pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] [1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/ Link: https://lkml.kernel.org/r/20250627072924.36567-3-lance.yang@linux.dev Signed-off-by: Lance Yang Suggested-by: Masami Hiramatsu (Google) Reviewed-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Mingzhe Yang Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/hung_task.h | 18 +++++++++--------- kernel/hung_task.c | 29 +++++++++++++++++++++++++---- kernel/locking/rwsem.c | 17 ++++++++++++++++- 3 files changed, 50 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h index 1bc2b3244613..34e615c76ca5 100644 --- a/include/linux/hung_task.h +++ b/include/linux/hung_task.h @@ -21,17 +21,17 @@ * type. * * Type encoding: - * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) - * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) - * 10 - Blocked on rt-mutex (BLOCKER_TYPE_RTMUTEX) - * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM) + * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) + * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) + * 10 - Blocked on rw-semaphore as READER (BLOCKER_TYPE_RWSEM_READER) + * 11 - Blocked on rw-semaphore as WRITER (BLOCKER_TYPE_RWSEM_WRITER) */ -#define BLOCKER_TYPE_MUTEX 0x00UL -#define BLOCKER_TYPE_SEM 0x01UL -#define BLOCKER_TYPE_RTMUTEX 0x02UL -#define BLOCKER_TYPE_RWSEM 0x03UL +#define BLOCKER_TYPE_MUTEX 0x00UL +#define BLOCKER_TYPE_SEM 0x01UL +#define BLOCKER_TYPE_RWSEM_READER 0x02UL +#define BLOCKER_TYPE_RWSEM_WRITER 0x03UL -#define BLOCKER_TYPE_MASK 0x03UL +#define BLOCKER_TYPE_MASK 0x03UL #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER static inline void hung_task_set_blocker(void *lock, unsigned long type) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d2432df2b905..8708a1205f82 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; + const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); @@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task) switch (blocker_type) { case BLOCKER_TYPE_MUTEX: - owner = mutex_get_owner( - (struct mutex *)hung_task_blocker_to_lock(blocker)); + owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_SEM: - owner = sem_last_holder( - (struct semaphore *)hung_task_blocker_to_lock(blocker)); + owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + owner = (unsigned long)rwsem_owner( + hung_task_blocker_to_lock(blocker)); + rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? + "reader" : "writer"; + rwsem_blocked_by = is_rwsem_reader_owned( + hung_task_blocker_to_lock(blocker)) ? + "reader" : "writer"; break; default: WARN_ON_ONCE(1); @@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", task->comm, task->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", + task->comm, task->pid); + break; } return; } @@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", task->comm, task->pid, t->comm, t->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", + task->comm, task->pid, rwsem_blocked_as, t->comm, + t->pid, rwsem_blocked_by); + break; } sched_show_task(t); return; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index a310eb9896de..92c6332da401 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #ifndef CONFIG_PREEMPT_RT @@ -1065,10 +1066,13 @@ queue: wake_up_q(&wake_q); trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); /* wait to be given the lock */ for (;;) { - set_current_state(state); if (!smp_load_acquire(&waiter.task)) { /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; @@ -1083,8 +1087,12 @@ queue: } schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); + set_current_state(state); } + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); trace_contention_end(sem, 0); @@ -1146,6 +1154,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1179,6 +1190,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) trylock_again: raw_spin_lock_irq(&sem->wait_lock); } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); -- cgit v1.2.3 From b3d5fd6f82dde8c906dc2a587003a44252ae5eae Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 6 Jun 2025 21:47:56 +0800 Subject: lib/math/gcd: use static key to select implementation at runtime Patch series "Optimize GCD performance on RISC-V by selecting implementation at runtime", v3. The current implementation of gcd() selects between the binary GCD and the odd-even GCD algorithm at compile time, depending on whether CONFIG_CPU_NO_EFFICIENT_FFS is set. On platforms like RISC-V, however, this compile-time decision can be misleading: even when the compiler emits ctz instructions based on the assumption that they are efficient (as is the case when CONFIG_RISCV_ISA_ZBB is enabled), the actual hardware may lack support for the Zbb extension. In such cases, ffs() falls back to a software implementation at runtime, making the binary GCD algorithm significantly slower than the odd-even variant. To address this, we introduce a static key to allow runtime selection between the binary and odd-even GCD implementations. On RISC-V, the kernel now checks for Zbb support during boot. If Zbb is unavailable, the static key is disabled so that gcd() consistently uses the more efficient odd-even algorithm in that scenario. Additionally, to further reduce code size, we select CONFIG_CPU_NO_EFFICIENT_FFS automatically when CONFIG_RISCV_ISA_ZBB is not enabled, avoiding compilation of the unused binary GCD implementation entirely on systems where it would never be executed. This series ensures that the most efficient GCD algorithm is used in practice and avoids compiling unnecessary code based on hardware capabilities and kernel configuration. This patch (of 3): On platforms like RISC-V, the compiler may generate hardware FFS instructions even if the underlying CPU does not actually support them. Currently, the GCD implementation is chosen at compile time based on CONFIG_CPU_NO_EFFICIENT_FFS, which can result in suboptimal behavior on such systems. Introduce a static key, efficient_ffs_key, to enable runtime selection between the binary GCD (using ffs) and the odd-even GCD implementation. This allows the kernel to default to the faster binary GCD when FFS is efficient, while retaining the ability to fall back when needed. Link: https://lkml.kernel.org/r/20250606134758.1308400-1-visitorckw@gmail.com Link: https://lkml.kernel.org/r/20250606134758.1308400-2-visitorckw@gmail.com Co-developed-by: Yu-Chun Lin Signed-off-by: Yu-Chun Lin Signed-off-by: Kuan-Wei Chiu Cc: Albert Ou Cc: Ching-Chun (Jim) Huang Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- include/linux/gcd.h | 3 +++ lib/math/gcd.c | 27 +++++++++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gcd.h b/include/linux/gcd.h index cb572677fd7f..616e81a7f7e3 100644 --- a/include/linux/gcd.h +++ b/include/linux/gcd.h @@ -3,6 +3,9 @@ #define _GCD_H #include +#include + +DECLARE_STATIC_KEY_TRUE(efficient_ffs_key); unsigned long gcd(unsigned long a, unsigned long b) __attribute_const__; diff --git a/lib/math/gcd.c b/lib/math/gcd.c index e3b042214d1b..62efca6787ae 100644 --- a/lib/math/gcd.c +++ b/lib/math/gcd.c @@ -11,22 +11,16 @@ * has decent hardware division. */ +DEFINE_STATIC_KEY_TRUE(efficient_ffs_key); + #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) /* If __ffs is available, the even/odd algorithm benchmarks slower. */ -/** - * gcd - calculate and return the greatest common divisor of 2 unsigned longs - * @a: first value - * @b: second value - */ -unsigned long gcd(unsigned long a, unsigned long b) +static unsigned long binary_gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; - if (!a || !b) - return r; - b >>= __ffs(b); if (b == 1) return r & -r; @@ -44,9 +38,15 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#else +#endif /* If normalization is done by loops, the even/odd algorithm is a win. */ + +/** + * gcd - calculate and return the greatest common divisor of 2 unsigned longs + * @a: first value + * @b: second value + */ unsigned long gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; @@ -54,6 +54,11 @@ unsigned long gcd(unsigned long a, unsigned long b) if (!a || !b) return r; +#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) + if (static_branch_likely(&efficient_ffs_key)) + return binary_gcd(a, b); +#endif + /* Isolate lsbit of r */ r &= -r; @@ -80,6 +85,4 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#endif - EXPORT_SYMBOL_GPL(gcd); -- cgit v1.2.3 From ad38574a8e8223361e265973fbd87013ea058c5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 8 Jul 2025 18:03:33 +0100 Subject: f2fs: Pass a folio to ADDRS_PER_PAGE() All callers now have a folio so pass it in. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/file.c | 18 +++++++++--------- fs/f2fs/recovery.c | 4 ++-- fs/f2fs/segment.c | 2 +- include/linux/f2fs_fs.h | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0e261caf2f91..8a2414ce39ff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1588,7 +1588,7 @@ next_dnode: start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 04a5a1089320..60618c52ba50 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -489,7 +489,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -814,7 +814,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } - count = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + count = ADDRS_PER_PAGE(dn.node_folio, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); @@ -1233,7 +1233,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -1332,7 +1332,7 @@ next_dnode: goto next; } - done = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, inode) - + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = f2fs_data_blkaddr(&dn); @@ -1421,7 +1421,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } ilen = min((pgoff_t) - ADDRS_PER_PAGE(&dn.node_folio->page, dst_inode) - + ADDRS_PER_PAGE(dn.node_folio, dst_inode) - dn.ofs_in_node, len - i); do { dn.data_blkaddr = f2fs_data_blkaddr(&dn); @@ -1717,7 +1717,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); end = min(pg_end, end_offset - dn.ofs_in_node + index); ret = f2fs_do_zero_range(&dn, index, end); @@ -3885,7 +3885,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4063,7 +4063,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4227,7 +4227,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - index); for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { struct block_device *cur_bdev; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 5a45d0d1f05c..894b27b0329d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -527,7 +527,7 @@ got_it: nid = le32_to_cpu(sum.nid); ofs_in_node = le16_to_cpu(sum.ofs_in_node); - max_addrs = ADDRS_PER_PAGE(&dn->node_folio->page, dn->inode); + max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); @@ -649,7 +649,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* step 3: recover data indices */ start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode); - end = start + ADDRS_PER_PAGE(&folio->page, inode); + end = start + ADDRS_PER_PAGE(folio, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2a6dcfba911f..909637873ff7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode) goto next; } - blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode), + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5206d63b3386..25857877eaec 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -268,7 +268,7 @@ struct node_footer { /* Node IDs in an Indirect Block */ #define NIDS_PER_BLOCK ((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32)) -#define ADDRS_PER_PAGE(page, inode) (addrs_per_page(inode, IS_INODE(page))) +#define ADDRS_PER_PAGE(folio, inode) (addrs_per_page(inode, IS_INODE(&folio->page))) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) -- cgit v1.2.3 From a5f3be6e652a7beaaf6c482bc013b64129a5d239 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 8 Jul 2025 18:03:34 +0100 Subject: f2fs: Pass a folio to IS_INODE() All callers now have a folio so pass it in. Also make it const to help the compiler. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 14 ++++++-------- fs/f2fs/recovery.c | 8 ++++---- include/linux/f2fs_fs.h | 2 +- 8 files changed, 18 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3933327d8cc3..09ddc0626dfe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3019,9 +3019,9 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) -static inline bool IS_INODE(struct page *page) +static inline bool IS_INODE(const struct folio *folio) { - struct f2fs_node *p = F2FS_NODE(page); + struct f2fs_node *p = F2FS_NODE(&folio->page); return RAW_IS_INODE(p); } @@ -3041,7 +3041,7 @@ static inline int f2fs_has_extra_attr(struct inode *inode); static inline unsigned int get_dnode_base(struct inode *inode, struct folio *node_folio) { - if (!IS_INODE(&node_folio->page)) + if (!IS_INODE(node_folio)) return 0; return inode ? get_extra_isize(inode) : diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 60618c52ba50..36b32757d5b9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -819,7 +819,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); - if (dn.ofs_in_node || IS_INODE(&dn.node_folio->page)) { + if (dn.ofs_in_node || IS_INODE(dn.node_folio)) { f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 781b955cbb77..c1d4ecbd2505 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1162,7 +1162,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - if (IS_INODE(&node_folio->page)) { + if (IS_INODE(node_folio)) { base = offset_in_addr(F2FS_INODE(node_folio)); max_addrs = DEF_ADDRS_PER_INODE; } else { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 9851310cdb87..51adc43d5a5c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -305,7 +305,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio) * x o -> remove data blocks, and then recover inline_data * x x -> recover data blocks */ - if (IS_INODE(&nfolio->page)) + if (IS_INODE(nfolio)) ri = F2FS_INODE(nfolio); if (f2fs_has_inline_data(inode) && diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a8f64d206d19..dd3b43c24831 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -124,7 +124,7 @@ bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) if (!f2fs_sb_has_inode_chksum(sbi)) return false; - if (!IS_INODE(&folio->page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b56e627e0b56..908a1eb9c415 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -973,7 +973,7 @@ static int truncate_dnode(struct dnode_of_data *dn) else if (IS_ERR(folio)) return PTR_ERR(folio); - if (IS_INODE(&folio->page) || ino_of_node(folio) != dn->inode->i_ino) { + if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) { f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", dn->inode->i_ino, dn->nid, ino_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -1474,10 +1474,8 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - struct page *page = &folio->page; - if (unlikely(nid != nid_of_node(folio) || - (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || (ntype == NODE_TYPE_XATTR && !f2fs_has_xattr_block(ofs_of_node(folio))) || time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { @@ -1867,7 +1865,7 @@ continue_unlock: if (!atomic || folio == last_folio) { set_fsync_mark(folio, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) f2fs_update_inode(inode, folio); @@ -1976,7 +1974,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(&folio->page)) + if (!IS_INODE(folio)) continue; folio_lock(folio); @@ -2077,7 +2075,7 @@ continue_unlock: } /* flush dirty inode */ - if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) + if (IS_INODE(folio) && flush_dirty_inode(folio)) goto lock_node; write_node: f2fs_folio_wait_writeback(folio, NODE, true, true); @@ -2213,7 +2211,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); #ifdef CONFIG_F2FS_CHECK_FS - if (IS_INODE(&folio->page)) + if (IS_INODE(folio)) f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio); #endif if (filemap_dirty_folio(mapping, folio)) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 894b27b0329d..4cb3a91801b4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -437,7 +437,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool quota_inode = false; if (!check_only && - IS_INODE(&folio->page) && + IS_INODE(folio) && is_dent_dnode(folio)) { err = f2fs_recover_inode_page(sbi, folio); if (err) { @@ -463,7 +463,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, } entry->blkaddr = blkaddr; - if (IS_INODE(&folio->page) && is_dent_dnode(folio)) + if (IS_INODE(folio) && is_dent_dnode(folio)) entry->last_dentry = blkaddr; next: /* check next segment */ @@ -628,7 +628,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, int err = 0, recovered = 0; /* step 1: recover xattr */ - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { err = f2fs_recover_inline_xattr(inode, folio); if (err) goto out; @@ -821,7 +821,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { err = recover_inode(entry->inode, folio); if (err) { f2fs_folio_put(folio, true); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 25857877eaec..2f8b8bfc0e73 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -268,7 +268,7 @@ struct node_footer { /* Node IDs in an Indirect Block */ #define NIDS_PER_BLOCK ((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32)) -#define ADDRS_PER_PAGE(folio, inode) (addrs_per_page(inode, IS_INODE(&folio->page))) +#define ADDRS_PER_PAGE(folio, inode) (addrs_per_page(inode, IS_INODE(folio))) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) -- cgit v1.2.3 From a824388d911927b2a82bf7dcfd7cef6ee45c8b43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 8 Jul 2025 18:03:36 +0100 Subject: f2fs: Use a folio in f2fs_is_cp_guaranteed() Convert the passed page to a folio and use it throughout. Removes a use of fscrypt_is_bounce_page(), which we're trying to remove. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 ++++++----- fs/f2fs/f2fs.h | 2 +- include/linux/fscrypt.h | 10 ++++++---- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f3e11f5672ec..c1fc8c7b1256 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,14 +47,15 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -bool f2fs_is_cp_guaranteed(struct page *page) +bool f2fs_is_cp_guaranteed(const struct page *page) { - struct address_space *mapping = page_folio(page)->mapping; + const struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; struct inode *inode; struct f2fs_sb_info *sbi; - if (fscrypt_is_bounce_page(page)) - return page_private_gcing(fscrypt_pagecache_page(page)); + if (fscrypt_is_bounce_folio(folio)) + return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio)); inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -65,7 +66,7 @@ bool f2fs_is_cp_guaranteed(struct page *page) return true; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || - page_private_gcing(page)) + folio_test_f2fs_gcing(folio)) return true; return false; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0e607305e308..be9b7a0120a9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3990,7 +3990,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -bool f2fs_is_cp_guaranteed(struct page *page); +bool f2fs_is_cp_guaranteed(const struct page *page); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 56fad33043d5..8d9127a0fdb3 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -332,12 +332,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return (struct page *)page_private(bounce_page); } -static inline bool fscrypt_is_bounce_folio(struct folio *folio) +static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return folio->mapping == NULL; } -static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) +static inline +struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { return bounce_folio->private; } @@ -518,12 +519,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return ERR_PTR(-EINVAL); } -static inline bool fscrypt_is_bounce_folio(struct folio *folio) +static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return false; } -static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) +static inline +struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); -- cgit v1.2.3 From ea4d331050b4cd43e6a900937db88b01ef75e1f2 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 16 Oct 2024 06:02:41 +0200 Subject: Input: touch-overlay - add touchscreen overlay handling Some touch devices provide mechanical overlays with different objects like buttons or clipped touchscreen surfaces. In order to support these objects, add a series of helper functions to the input subsystem to transform them into overlay objects via device tree nodes. These overlay objects consume the raw touch events and report the expected input events depending on the object properties. Note that the current implementation allows for multiple definitions of touchscreen areas (regions that report touch events), but only the first one will be used for the touchscreen device that the consumers typically provide. Should the need for multiple touchscreen areas arise, additional touchscreen devices would be required at the consumer side. There is no limitation in the number of touch areas defined as buttons. Reviewed-by: Jeff LaBundy Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20241016-feature-ts_virtobj_patch-v11-2-b292a1bbb0a1@wolfvision.net Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 7 + drivers/input/Makefile | 2 +- drivers/input/touch-overlay.c | 277 ++++++++++++++++++++++++++++++++++++ include/linux/input/touch-overlay.h | 25 ++++ 4 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 drivers/input/touch-overlay.c create mode 100644 include/linux/input/touch-overlay.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index adcd58147f97..3466d0d59c5f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24609,6 +24609,13 @@ L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/toshiba-wmi.c +TOUCH OVERLAY +M: Javier Carrasco +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/input/touch-overlay.c +F: include/linux/input/touch-overlay.h + TPM DEVICE DRIVER M: Peter Huewe M: Jarkko Sakkinen diff --git a/drivers/input/Makefile b/drivers/input/Makefile index 930b64d2115e..2cd6e1c9a778 100644 --- a/drivers/input/Makefile +++ b/drivers/input/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_INPUT) += input-core.o input-core-y := input.o input-compat.o input-mt.o input-poller.o ff-core.o -input-core-y += touchscreen.o +input-core-y += touchscreen.o touch-overlay.o obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o obj-$(CONFIG_INPUT_SPARSEKMAP) += sparse-keymap.o diff --git a/drivers/input/touch-overlay.c b/drivers/input/touch-overlay.c new file mode 100644 index 000000000000..8806373f7a4a --- /dev/null +++ b/drivers/input/touch-overlay.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Helper functions for overlay objects on touchscreens + * + * Copyright (c) 2023 Javier Carrasco + */ + +#include +#include +#include +#include +#include +#include + +struct touch_overlay_segment { + struct list_head list; + u32 x_origin; + u32 y_origin; + u32 x_size; + u32 y_size; + u32 key; + bool pressed; + int slot; +}; + +static int touch_overlay_get_segment(struct fwnode_handle *segment_node, + struct touch_overlay_segment *segment, + struct input_dev *input) +{ + int error; + + error = fwnode_property_read_u32(segment_node, "x-origin", + &segment->x_origin); + if (error) + return error; + + error = fwnode_property_read_u32(segment_node, "y-origin", + &segment->y_origin); + if (error) + return error; + + error = fwnode_property_read_u32(segment_node, "x-size", + &segment->x_size); + if (error) + return error; + + error = fwnode_property_read_u32(segment_node, "y-size", + &segment->y_size); + if (error) + return error; + + error = fwnode_property_read_u32(segment_node, "linux,code", + &segment->key); + if (!error) + input_set_capability(input, EV_KEY, segment->key); + else if (error != -EINVAL) + return error; + + return 0; +} + +/** + * touch_overlay_map - map overlay objects from the device tree and set + * key capabilities if buttons are defined. + * @list: pointer to the list that will hold the segments + * @input: pointer to the already allocated input_dev + * + * Returns 0 on success and error number otherwise. + * + * If buttons are defined, key capabilities are set accordingly. + */ +int touch_overlay_map(struct list_head *list, struct input_dev *input) +{ + struct fwnode_handle *fw_segment; + struct device *dev = input->dev.parent; + struct touch_overlay_segment *segment; + int error; + + struct fwnode_handle *overlay __free(fwnode_handle) = + device_get_named_child_node(dev, "touch-overlay"); + if (!overlay) + return 0; + + fwnode_for_each_available_child_node(overlay, fw_segment) { + segment = devm_kzalloc(dev, sizeof(*segment), GFP_KERNEL); + if (!segment) { + fwnode_handle_put(fw_segment); + return -ENOMEM; + } + error = touch_overlay_get_segment(fw_segment, segment, input); + if (error) { + fwnode_handle_put(fw_segment); + return error; + } + list_add_tail(&segment->list, list); + } + + return 0; +} +EXPORT_SYMBOL(touch_overlay_map); + +/** + * touch_overlay_get_touchscreen_abs - get abs size from the touchscreen area. + * @list: pointer to the list that holds the segments + * @x: horizontal abs + * @y: vertical abs + */ +void touch_overlay_get_touchscreen_abs(struct list_head *list, u16 *x, u16 *y) +{ + struct touch_overlay_segment *segment; + struct list_head *ptr; + + list_for_each(ptr, list) { + segment = list_entry(ptr, struct touch_overlay_segment, list); + if (!segment->key) { + *x = segment->x_size - 1; + *y = segment->y_size - 1; + break; + } + } +} +EXPORT_SYMBOL(touch_overlay_get_touchscreen_abs); + +static bool touch_overlay_segment_event(struct touch_overlay_segment *seg, + struct input_mt_pos *pos) +{ + if (pos->x >= seg->x_origin && pos->x < (seg->x_origin + seg->x_size) && + pos->y >= seg->y_origin && pos->y < (seg->y_origin + seg->y_size)) + return true; + + return false; +} + +/** + * touch_overlay_mapped_touchscreen - check if a touchscreen area is mapped + * @list: pointer to the list that holds the segments + * + * Returns true if a touchscreen area is mapped or false otherwise. + */ +bool touch_overlay_mapped_touchscreen(struct list_head *list) +{ + struct touch_overlay_segment *segment; + struct list_head *ptr; + + list_for_each(ptr, list) { + segment = list_entry(ptr, struct touch_overlay_segment, list); + if (!segment->key) + return true; + } + + return false; +} +EXPORT_SYMBOL(touch_overlay_mapped_touchscreen); + +static bool touch_overlay_event_on_ts(struct list_head *list, + struct input_mt_pos *pos) +{ + struct touch_overlay_segment *segment; + struct list_head *ptr; + + list_for_each(ptr, list) { + segment = list_entry(ptr, struct touch_overlay_segment, list); + if (segment->key) + continue; + + if (touch_overlay_segment_event(segment, pos)) { + pos->x -= segment->x_origin; + pos->y -= segment->y_origin; + return true; + } + /* ignore touch events outside the defined area */ + return false; + } + + return true; +} + +static bool touch_overlay_button_event(struct input_dev *input, + struct touch_overlay_segment *segment, + struct input_mt_pos *pos, int slot) +{ + struct input_mt *mt = input->mt; + struct input_mt_slot *s = &mt->slots[slot]; + bool button_contact = touch_overlay_segment_event(segment, pos); + + if (segment->slot == slot && segment->pressed) { + /* sliding out of the button releases it */ + if (!button_contact) { + input_report_key(input, segment->key, false); + segment->pressed = false; + /* keep available for a possible touch event */ + return false; + } + /* ignore sliding on the button while pressed */ + s->frame = mt->frame; + return true; + } else if (button_contact) { + input_report_key(input, segment->key, true); + s->frame = mt->frame; + segment->slot = slot; + segment->pressed = true; + return true; + } + + return false; +} + +/** + * touch_overlay_sync_frame - update the status of the segments and report + * buttons whose tracked slot is unused. + * @list: pointer to the list that holds the segments + * @input: pointer to the input device associated to the contact + */ +void touch_overlay_sync_frame(struct list_head *list, struct input_dev *input) +{ + struct touch_overlay_segment *segment; + struct input_mt *mt = input->mt; + struct input_mt_slot *s; + struct list_head *ptr; + + list_for_each(ptr, list) { + segment = list_entry(ptr, struct touch_overlay_segment, list); + if (!segment->key) + continue; + + s = &mt->slots[segment->slot]; + if (!input_mt_is_used(mt, s) && segment->pressed) { + input_report_key(input, segment->key, false); + segment->pressed = false; + } + } +} +EXPORT_SYMBOL(touch_overlay_sync_frame); + +/** + * touch_overlay_process_contact - process contacts according to the overlay + * mapping. This function acts as a filter to release the calling driver + * from the contacts that are either related to overlay buttons or out of the + * overlay touchscreen area, if defined. + * @list: pointer to the list that holds the segments + * @input: pointer to the input device associated to the contact + * @pos: pointer to the contact position + * @slot: slot associated to the contact (0 if multitouch is not supported) + * + * Returns true if the contact was processed (reported for valid key events + * and dropped for contacts outside the overlay touchscreen area) or false + * if the contact must be processed by the caller. In that case this function + * shifts the (x,y) coordinates to the overlay touchscreen axis if required. + */ +bool touch_overlay_process_contact(struct list_head *list, + struct input_dev *input, + struct input_mt_pos *pos, int slot) +{ + struct touch_overlay_segment *segment; + struct list_head *ptr; + + /* + * buttons must be prioritized over overlay touchscreens to account for + * overlappings e.g. a button inside the touchscreen area. + */ + list_for_each(ptr, list) { + segment = list_entry(ptr, struct touch_overlay_segment, list); + if (segment->key && + touch_overlay_button_event(input, segment, pos, slot)) + return true; + } + + /* + * valid contacts on the overlay touchscreen are left for the client + * to be processed/reported according to its (possibly) unique features. + */ + return !touch_overlay_event_on_ts(list, pos); +} +EXPORT_SYMBOL(touch_overlay_process_contact); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Helper functions for overlay objects on touch devices"); diff --git a/include/linux/input/touch-overlay.h b/include/linux/input/touch-overlay.h new file mode 100644 index 000000000000..0253e554d3cd --- /dev/null +++ b/include/linux/input/touch-overlay.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023 Javier Carrasco + */ + +#ifndef _TOUCH_OVERLAY +#define _TOUCH_OVERLAY + +#include + +struct input_dev; + +int touch_overlay_map(struct list_head *list, struct input_dev *input); + +void touch_overlay_get_touchscreen_abs(struct list_head *list, u16 *x, u16 *y); + +bool touch_overlay_mapped_touchscreen(struct list_head *list); + +bool touch_overlay_process_contact(struct list_head *list, + struct input_dev *input, + struct input_mt_pos *pos, int slot); + +void touch_overlay_sync_frame(struct list_head *list, struct input_dev *input); + +#endif -- cgit v1.2.3 From 5523a466e905b6287b94654ddb364536f2f948cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Jul 2025 11:06:03 +0200 Subject: i3c: fix module_i3c_i2c_driver() with I3C=n When CONFIG_I3C is disabled and the i3c_i2c_driver_register() happens to not be inlined, any driver calling it still references the i3c_driver instance, which then causes a link failure: x86_64-linux-ld: drivers/hwmon/lm75.o: in function `lm75_i3c_reg_read': lm75.c:(.text+0xc61): undefined reference to `i3cdev_to_dev' x86_64-linux-ld: lm75.c:(.text+0xd25): undefined reference to `i3c_device_do_priv_xfers' x86_64-linux-ld: lm75.c:(.text+0xdd8): undefined reference to `i3c_device_do_priv_xfers' This issue was part of the original i3c code, but only now caused problems when i3c support got added to lm75. Change the 'inline' annotations in the header to '__always_inline' to ensure that the dead-code-elimination pass in the compiler can optimize it out as intended. Fixes: 6071d10413ff ("hwmon: (lm75) add I3C support for P3T1755") Fixes: 3a379bbcea0a ("i3c: Add core I3C infrastructure") Signed-off-by: Arnd Bergmann Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Reviewed-by: Guenter Roeck Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250725090609.2456262-1-arnd@kernel.org Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index b674f64d0822..7f136de4b73e 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -245,7 +245,7 @@ void i3c_driver_unregister(struct i3c_driver *drv); * * Return: 0 if both registrations succeeds, a negative error code otherwise. */ -static inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv, +static __always_inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv, struct i2c_driver *i2cdrv) { int ret; @@ -270,7 +270,7 @@ static inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv, * Note that when CONFIG_I3C is not enabled, this function only unregisters the * @i2cdrv. */ -static inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, +static __always_inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, struct i2c_driver *i2cdrv) { if (IS_ENABLED(CONFIG_I3C)) -- cgit v1.2.3 From 9c0609d685b27a0bb392390680207baa820ed118 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 24 Jul 2025 11:41:40 +0200 Subject: i3c: Standardize defines for specification parameters Align existing defines to follow the consistent pattern: I3C_BUS___. Prepare the codebase for adding new parameters and help avoid duplication. Signed-off-by: Wolfram Sang Tested-by: Tommaso Merciai Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250724094146.6443-2-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master.c | 12 ++++++------ drivers/i3c/master/dw-i3c-master.c | 4 ++-- include/linux/i3c/master.h | 9 +++++---- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index e00991444f31..2ef898a8fd80 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -727,12 +727,12 @@ static int i3c_bus_set_mode(struct i3c_bus *i3cbus, enum i3c_bus_mode mode, switch (i3cbus->mode) { case I3C_BUS_MODE_PURE: if (!i3cbus->scl_rate.i3c) - i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE; + i3cbus->scl_rate.i3c = I3C_BUS_I3C_SCL_TYP_RATE; break; case I3C_BUS_MODE_MIXED_FAST: case I3C_BUS_MODE_MIXED_LIMITED: if (!i3cbus->scl_rate.i3c) - i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE; + i3cbus->scl_rate.i3c = I3C_BUS_I3C_SCL_TYP_RATE; if (!i3cbus->scl_rate.i2c) i3cbus->scl_rate.i2c = max_i2c_scl_rate; break; @@ -754,8 +754,8 @@ static int i3c_bus_set_mode(struct i3c_bus *i3cbus, enum i3c_bus_mode mode, * I3C/I2C frequency may have been overridden, check that user-provided * values are not exceeding max possible frequency. */ - if (i3cbus->scl_rate.i3c > I3C_BUS_MAX_I3C_SCL_RATE || - i3cbus->scl_rate.i2c > I3C_BUS_I2C_FM_PLUS_SCL_RATE) + if (i3cbus->scl_rate.i3c > I3C_BUS_I3C_SCL_MAX_RATE || + i3cbus->scl_rate.i2c > I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE) return -EINVAL; return 0; @@ -2787,7 +2787,7 @@ int i3c_master_register(struct i3c_master_controller *master, const struct i3c_master_controller_ops *ops, bool secondary) { - unsigned long i2c_scl_rate = I3C_BUS_I2C_FM_PLUS_SCL_RATE; + unsigned long i2c_scl_rate = I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE; struct i3c_bus *i3cbus = i3c_master_get_bus(master); enum i3c_bus_mode mode = I3C_BUS_MODE_PURE; struct i2c_dev_boardinfo *i2cbi; @@ -2846,7 +2846,7 @@ int i3c_master_register(struct i3c_master_controller *master, } if (i2cbi->lvr & I3C_LVR_I2C_FM_MODE) - i2c_scl_rate = I3C_BUS_I2C_FM_SCL_RATE; + i2c_scl_rate = I3C_BUS_I2C_FM_SCL_MAX_RATE; } ret = i3c_bus_set_mode(i3cbus, mode, i2c_scl_rate); diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index cc872b481691..e61be28cd1e3 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -605,14 +605,14 @@ static int dw_i2c_clk_cfg(struct dw_i3c_master *master) core_period = DIV_ROUND_UP(1000000000, core_rate); lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FMP_TLOW_MIN_NS, core_period); - hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_PLUS_SCL_RATE) - lcnt; + hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE) - lcnt; scl_timing = SCL_I2C_FMP_TIMING_HCNT(hcnt) | SCL_I2C_FMP_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I2C_FMP_TIMING); master->i2c_fmp_timing = scl_timing; lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FM_TLOW_MIN_NS, core_period); - hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_RATE) - lcnt; + hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_MAX_RATE) - lcnt; scl_timing = SCL_I2C_FM_TIMING_HCNT(hcnt) | SCL_I2C_FM_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I2C_FM_TIMING); diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index c67922ece617..7dfcbe530515 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -249,10 +249,11 @@ struct i3c_device { */ #define I3C_BUS_MAX_DEVS 11 -#define I3C_BUS_MAX_I3C_SCL_RATE 12900000 -#define I3C_BUS_TYP_I3C_SCL_RATE 12500000 -#define I3C_BUS_I2C_FM_PLUS_SCL_RATE 1000000 -#define I3C_BUS_I2C_FM_SCL_RATE 400000 +/* Taken from the I3C Spec V1.1.1, chapter 6.2. "Timing specification" */ +#define I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE 1000000 +#define I3C_BUS_I2C_FM_SCL_MAX_RATE 400000 +#define I3C_BUS_I3C_SCL_MAX_RATE 12900000 +#define I3C_BUS_I3C_SCL_TYP_RATE 12500000 #define I3C_BUS_TLOW_OD_MIN_NS 200 /** -- cgit v1.2.3 From 8acf1f3bae1ea48949458b67d68a72a95c3244a4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 24 Jul 2025 11:41:41 +0200 Subject: i3c: Add more parameters for controllers to the header Add standard timing value definition from specification. Signed-off-by: Wolfram Sang Tested-by: Tommaso Merciai Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250724094146.6443-3-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 7dfcbe530515..043f5c7ff398 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -254,6 +254,10 @@ struct i3c_device { #define I3C_BUS_I2C_FM_SCL_MAX_RATE 400000 #define I3C_BUS_I3C_SCL_MAX_RATE 12900000 #define I3C_BUS_I3C_SCL_TYP_RATE 12500000 +#define I3C_BUS_TAVAL_MIN_NS 1000 +#define I3C_BUS_TBUF_MIXED_FM_MIN_NS 1300 +#define I3C_BUS_THIGH_MIXED_MAX_NS 41 +#define I3C_BUS_TIDLE_MIN_NS 200000 #define I3C_BUS_TLOW_OD_MIN_NS 200 /** -- cgit v1.2.3 From 0060beec0bfa647c4b510df188b1c4673a197839 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 28 Jul 2025 13:04:29 +0900 Subject: ata: libata-sata: Add link_power_management_supported sysfs attribute A port link power management (LPM) policy can be controlled using the link_power_management_policy sysfs host attribute. However, this attribute exists also for hosts that do not support LPM and in such case, attempting to change the LPM policy for the host (port) will fail with -EOPNOTSUPP. Introduce the new sysfs link_power_management_supported host attribute to indicate to the user if a the port and the devices connected to the port for the host support LPM, which implies that the link_power_management_policy attribute can be used. Since checking that a port and its devices support LPM is common between the new ata_scsi_lpm_supported_show() function and the existing ata_scsi_lpm_store() function, the new helper ata_scsi_lpm_supported() is introduced. Fixes: 413e800cadbf ("ata: libata-sata: Disallow changing LPM state if not supported") Reported-by: Borah, Chaitanya Kumar Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202507251014.a5becc3b-lkp@intel.com Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen --- drivers/ata/ata_piix.c | 1 + drivers/ata/libahci.c | 1 + drivers/ata/libata-sata.c | 53 ++++++++++++++++++++++++++++++++++++----------- include/linux/libata.h | 1 + 4 files changed, 44 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index 229429ba5027..495fa096dd65 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -1089,6 +1089,7 @@ static struct ata_port_operations ich_pata_ops = { }; static struct attribute *piix_sidpr_shost_attrs[] = { + &dev_attr_link_power_management_supported.attr, &dev_attr_link_power_management_policy.attr, NULL }; diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index b335fb7e5cb4..c79abdfcd7a9 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -111,6 +111,7 @@ static DEVICE_ATTR(em_buffer, S_IWUSR | S_IRUGO, static DEVICE_ATTR(em_message_supported, S_IRUGO, ahci_show_em_supported, NULL); static struct attribute *ahci_shost_attrs[] = { + &dev_attr_link_power_management_supported.attr, &dev_attr_link_power_management_policy.attr, &dev_attr_em_message_type.attr, &dev_attr_em_message.attr, diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c index 4734465d3b1e..b2817a2995d6 100644 --- a/drivers/ata/libata-sata.c +++ b/drivers/ata/libata-sata.c @@ -900,14 +900,52 @@ static const char *ata_lpm_policy_names[] = { [ATA_LPM_MIN_POWER] = "min_power", }; +/* + * Check if a port supports link power management. + * Must be called with the port locked. + */ +static bool ata_scsi_lpm_supported(struct ata_port *ap) +{ + struct ata_link *link; + struct ata_device *dev; + + if (ap->flags & ATA_FLAG_NO_LPM) + return false; + + ata_for_each_link(link, ap, EDGE) { + ata_for_each_dev(dev, &ap->link, ENABLED) { + if (dev->quirks & ATA_QUIRK_NOLPM) + return false; + } + } + + return true; +} + +static ssize_t ata_scsi_lpm_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + unsigned long flags; + bool supported; + + spin_lock_irqsave(ap->lock, flags); + supported = ata_scsi_lpm_supported(ap); + spin_unlock_irqrestore(ap->lock, flags); + + return sysfs_emit(buf, "%d\n", supported); +} +DEVICE_ATTR(link_power_management_supported, S_IRUGO, + ata_scsi_lpm_supported_show, NULL); +EXPORT_SYMBOL_GPL(dev_attr_link_power_management_supported); + static ssize_t ata_scsi_lpm_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { struct Scsi_Host *shost = class_to_shost(device); struct ata_port *ap = ata_shost_to_port(shost); - struct ata_link *link; - struct ata_device *dev; enum ata_lpm_policy policy; unsigned long flags; @@ -924,20 +962,11 @@ static ssize_t ata_scsi_lpm_store(struct device *device, spin_lock_irqsave(ap->lock, flags); - if (ap->flags & ATA_FLAG_NO_LPM) { + if (!ata_scsi_lpm_supported(ap)) { count = -EOPNOTSUPP; goto out_unlock; } - ata_for_each_link(link, ap, EDGE) { - ata_for_each_dev(dev, &ap->link, ENABLED) { - if (dev->quirks & ATA_QUIRK_NOLPM) { - count = -EOPNOTSUPP; - goto out_unlock; - } - } - } - ap->target_lpm_policy = policy; ata_port_schedule_eh(ap); out_unlock: diff --git a/include/linux/libata.h b/include/linux/libata.h index 912ace523880..0620dd67369f 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -545,6 +545,7 @@ typedef void (*ata_postreset_fn_t)(struct ata_link *link, unsigned int *classes) extern struct device_attribute dev_attr_unload_heads; #ifdef CONFIG_SATA_HOST +extern struct device_attribute dev_attr_link_power_management_supported; extern struct device_attribute dev_attr_link_power_management_policy; extern struct device_attribute dev_attr_ncq_prio_supported; extern struct device_attribute dev_attr_ncq_prio_enable; -- cgit v1.2.3 From 199d9ffb31650f948dd342ade1c1b920e157630f Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 11 Jul 2025 15:31:36 +0200 Subject: module: move 'struct module_use' to internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct was moved to the public header file in commit c8e21ced08b3 ("module: fix kdb's illicit use of struct module_use."). Back then the structure was used outside of the module core. Nowadays this is not true anymore, so the structure can be made internal. Signed-off-by: Thomas Weißschuh Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20250711-kunit-ifdef-modules-v2-1-39443decb1f8@linutronix.de Signed-off-by: Daniel Gomez --- include/linux/module.h | 7 ------- kernel/module/internal.h | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index a7cac01d95e7..97c38e1cd377 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -313,13 +313,6 @@ void *__symbol_get_gpl(const char *symbol); __used __section(".no_trim_symbol") = __stringify(x); \ (typeof(&x))(__symbol_get(__stringify(x))); }) -/* modules using other modules: kdb wants to see this. */ -struct module_use { - struct list_head source_list; - struct list_head target_list; - struct module *source, *target; -}; - enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 51ddd8866ef3..618202578b42 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -112,6 +112,13 @@ struct find_symbol_arg { enum mod_license license; }; +/* modules using other modules */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + int mod_verify_sig(const void *mod, struct load_info *info); int try_to_force_load(struct module *mod, const char *reason); bool find_symbol(struct find_symbol_arg *fsa); -- cgit v1.2.3 From 818783c804bc051f7faf0ac226b5597f8259c6f8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 11 Jul 2025 15:31:37 +0200 Subject: module: make structure definitions always visible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To write code that works with both CONFIG_MODULES=y and CONFIG_MODULES=n it is convenient to use "if (IS_ENABLED(CONFIG_MODULES))" over raw #ifdef. The code will still fully typechecked but the unreachable parts are discarded by the compiler. This prevents accidental breakage when a certain kconfig combination was not specifically tested by the developer. This pattern is already supported to some extend by module.h defining empty stub functions if CONFIG_MODULES=n. However some users of module.h work on the structured defined by module.h. Therefore these structure definitions need to be visible, too. Many structure members are still gated by specific configuration settings. The assumption for those is that the code using them will be gated behind the same configuration setting anyways. Signed-off-by: Thomas Weißschuh Reviewed-by: Daniel Gomez Link: https://lore.kernel.org/r/20250711-kunit-ifdef-modules-v2-2-39443decb1f8@linutronix.de Signed-off-by: Daniel Gomez --- include/linux/module.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 97c38e1cd377..5fe812de2d84 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -303,16 +303,6 @@ static typeof(name) __mod_device_table__##type##__##name \ struct notifier_block; -#ifdef CONFIG_MODULES - -/* Get/put a kernel symbol (calls must be symmetric) */ -void *__symbol_get(const char *symbol); -void *__symbol_get_gpl(const char *symbol); -#define symbol_get(x) ({ \ - static const char __notrim[] \ - __used __section(".no_trim_symbol") = __stringify(x); \ - (typeof(&x))(__symbol_get(__stringify(x))); }) - enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ @@ -597,6 +587,16 @@ struct module { #define MODULE_ARCH_INIT {} #endif +#ifdef CONFIG_MODULES + +/* Get/put a kernel symbol (calls must be symmetric) */ +void *__symbol_get(const char *symbol); +void *__symbol_get_gpl(const char *symbol); +#define symbol_get(x) ({ \ + static const char __notrim[] \ + __used __section(".no_trim_symbol") = __stringify(x); \ + (typeof(&x))(__symbol_get(__stringify(x))); }) + #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) { -- cgit v1.2.3 From bdc877ba6b7ff1b6d2ebeff11e63da4a50a54854 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 30 Jun 2025 16:32:34 +0200 Subject: module: Restore the moduleparam prefix length check The moduleparam code allows modules to provide their own definition of MODULE_PARAM_PREFIX, instead of using the default KBUILD_MODNAME ".". Commit 730b69d22525 ("module: check kernel param length at compile time, not runtime") added a check to ensure the prefix doesn't exceed MODULE_NAME_LEN, as this is what param_sysfs_builtin() expects. Later, commit 58f86cc89c33 ("VERIFY_OCTAL_PERMISSIONS: stricter checking for sysfs perms.") removed this check, but there is no indication this was intentional. Since the check is still useful for param_sysfs_builtin() to function properly, reintroduce it in __module_param_call(), but in a modernized form using static_assert(). While here, clean up the __module_param_call() comments. In particular, remove the comment "Default value instead of permissions?", which comes from commit 9774a1f54f17 ("[PATCH] Compile-time check re world-writeable module params"). This comment was related to the test variable __param_perm_check_##name, which was removed in the previously mentioned commit 58f86cc89c33. Fixes: 58f86cc89c33 ("VERIFY_OCTAL_PERMISSIONS: stricter checking for sysfs perms.") Signed-off-by: Petr Pavlu Reviewed-by: Daniel Gomez Link: https://lore.kernel.org/r/20250630143535.267745-4-petr.pavlu@suse.com Signed-off-by: Daniel Gomez --- include/linux/moduleparam.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index bfb85fd13e1f..110e9d09de24 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -282,10 +282,9 @@ struct kparam_array #define __moduleparam_const const #endif -/* This is the fundamental function for registering boot/module - parameters. */ +/* This is the fundamental function for registering boot/module parameters. */ #define __module_param_call(prefix, name, ops, arg, perm, level, flags) \ - /* Default value instead of permissions? */ \ + static_assert(sizeof(""prefix) - 1 <= MAX_PARAM_PREFIX_LEN); \ static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used __section("__param") \ -- cgit v1.2.3 From 40a826bd6c82ae45cfd3a19cd2a60a10f56b74c0 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 30 Jun 2025 16:32:36 +0200 Subject: module: Rename MAX_PARAM_PREFIX_LEN to __MODULE_NAME_LEN The maximum module name length (MODULE_NAME_LEN) is somewhat confusingly defined in terms of the maximum parameter prefix length (MAX_PARAM_PREFIX_LEN), when in fact the dependency is in the opposite direction. This split originates from commit 730b69d22525 ("module: check kernel param length at compile time, not runtime"). The code needed to use MODULE_NAME_LEN in moduleparam.h, but because module.h requires moduleparam.h, this created a circular dependency. It was resolved by introducing MAX_PARAM_PREFIX_LEN in moduleparam.h and defining MODULE_NAME_LEN in module.h in terms of MAX_PARAM_PREFIX_LEN. Rename MAX_PARAM_PREFIX_LEN to __MODULE_NAME_LEN for clarity. This matches the similar approach of defining MODULE_INFO in module.h and __MODULE_INFO in moduleparam.h. Signed-off-by: Petr Pavlu Reviewed-by: Daniel Gomez Link: https://lore.kernel.org/r/20250630143535.267745-6-petr.pavlu@suse.com Signed-off-by: Daniel Gomez --- include/linux/module.h | 2 +- include/linux/moduleparam.h | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 5fe812de2d84..313ecb8e5181 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -33,7 +33,7 @@ #include #include -#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN +#define MODULE_NAME_LEN __MODULE_NAME_LEN struct modversion_info { unsigned long crc; diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 110e9d09de24..a04a2bc4f51e 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -6,6 +6,13 @@ #include #include +/* + * The maximum module name length, including the NUL byte. + * Chosen so that structs with an unsigned long line up, specifically + * modversion_info. + */ +#define __MODULE_NAME_LEN (64 - sizeof(unsigned long)) + /* You can override this manually, but generally this should match the module name. */ #ifdef MODULE @@ -17,9 +24,6 @@ #define __MODULE_INFO_PREFIX KBUILD_MODNAME "." #endif -/* Chosen so that structs with an unsigned long line up. */ -#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) - #define __MODULE_INFO(tag, name, info) \ static const char __UNIQUE_ID(name)[] \ __used __section(".modinfo") __aligned(1) \ @@ -284,7 +288,7 @@ struct kparam_array /* This is the fundamental function for registering boot/module parameters. */ #define __module_param_call(prefix, name, ops, arg, perm, level, flags) \ - static_assert(sizeof(""prefix) - 1 <= MAX_PARAM_PREFIX_LEN); \ + static_assert(sizeof(""prefix) - 1 <= __MODULE_NAME_LEN); \ static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used __section("__param") \ -- cgit v1.2.3 From 788fa4b47cdcd9b3d8c2d02ac0b3cd2540305f18 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2025 16:37:24 -0400 Subject: tracing: Add guard(ring_buffer_nest) Some calls to the tracing ring buffer can happen when the ring buffer is already being written to by the same context (for example, a trace_printk() in between a ring_buffer_lock_reserve() and a ring_buffer_unlock_commit()). In order to not trigger the recursion detection, these functions use ring_buffer_nest_start() and ring_buffer_nest_end(). Create a guard() for these functions so that their use cases can be simplified and not need to use goto for the release. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250801203857.710501021@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 ++ kernel/trace/trace.c | 69 ++++++++++++++++----------------------- kernel/trace/trace_events_synth.c | 6 ++-- 3 files changed, 34 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index cd7f0ae26615..8253cb69540c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -144,6 +144,9 @@ int ring_buffer_write(struct trace_buffer *buffer, void ring_buffer_nest_start(struct trace_buffer *buffer); void ring_buffer_nest_end(struct trace_buffer *buffer); +DEFINE_GUARD(ring_buffer_nest, struct trace_buffer *, + ring_buffer_nest_start(_T), ring_buffer_nest_end(_T)) + struct ring_buffer_event * ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0ec9cab9a812..332487179e1d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1160,13 +1160,11 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); - if (!event) { - size = 0; - goto out; - } + if (!event) + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1182,8 +1180,6 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - out: - ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); @@ -1213,7 +1209,6 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); - int ret = 0; if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); @@ -1227,11 +1222,11 @@ int __trace_bputs(unsigned long ip, const char *str) trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) - goto out; + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1240,10 +1235,7 @@ int __trace_bputs(unsigned long ip, const char *str) __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - ret = 1; - out: - ring_buffer_nest_end(buffer); - return ret; + return 1; } EXPORT_SYMBOL_GPL(__trace_bputs); @@ -3397,21 +3389,19 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; -out: - ring_buffer_nest_end(buffer); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } out_put: put_trace_buf(); @@ -3452,20 +3442,19 @@ int __trace_array_vprintk(struct trace_buffer *buffer, len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, tbuffer, len + 1); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + memcpy(&entry->buf, tbuffer, len + 1); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + } out: - ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 33cfbd4ed76d..f24ee61f8884 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -536,12 +536,12 @@ static notrace void trace_event_raw_event_synth(void *__data, * is being performed within another event. */ buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + fields_size); if (!entry) - goto out; + return; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { val_idx = var_ref_idx[i]; @@ -584,8 +584,6 @@ static notrace void trace_event_raw_event_synth(void *__data, } trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); } static void free_synth_event_print_fmt(struct trace_event_call *call) -- cgit v1.2.3 From 6c6d8f8ba7789c221a2e4c43a0ed982c7a41f428 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 16 Jul 2025 14:32:45 +0100 Subject: lib/xxhash: remove unused functions xxh32_digest() and xxh32_update() were added in 2017 in the original xxhash commit, but have remained unused. Remove them. Link: https://lkml.kernel.org/r/20250716133245.243363-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Christoph Hellwig Cc: Dave Gilbert Cc: Nick Terrell Signed-off-by: Andrew Morton --- include/linux/xxhash.h | 26 ------------ lib/xxhash.c | 107 ------------------------------------------------- 2 files changed, 133 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index df42511438d0..27f57eca8cb1 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -177,32 +177,6 @@ struct xxh64_state { */ void xxh32_reset(struct xxh32_state *state, uint32_t seed); -/** - * xxh32_update() - hash the data given and update the xxh32 state - * - * @state: The xxh32 state to update. - * @input: The data to hash. - * @length: The length of the data to hash. - * - * After calling xxh32_reset() call xxh32_update() as many times as necessary. - * - * Return: Zero on success, otherwise an error code. - */ -int xxh32_update(struct xxh32_state *state, const void *input, size_t length); - -/** - * xxh32_digest() - produce the current xxh32 hash - * - * @state: Produce the current xxh32 hash of this state. - * - * A hash value can be produced at any time. It is still possible to continue - * inserting input into the hash state after a call to xxh32_digest(), and - * generate new hashes later on, by calling xxh32_digest() again. - * - * Return: The xxh32 hash stored in the state. - */ -uint32_t xxh32_digest(const struct xxh32_state *state); - /** * xxh64_reset() - reset the xxh64 state to start a new hashing operation * diff --git a/lib/xxhash.c b/lib/xxhash.c index b5bd567aa6b3..cf629766f376 100644 --- a/lib/xxhash.c +++ b/lib/xxhash.c @@ -267,113 +267,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) } EXPORT_SYMBOL(xxh64_reset); -int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) -{ - const uint8_t *p = (const uint8_t *)input; - const uint8_t *const b_end = p + len; - - if (input == NULL) - return -EINVAL; - - state->total_len_32 += (uint32_t)len; - state->large_len |= (len >= 16) | (state->total_len_32 >= 16); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); - state->memsize += (uint32_t)len; - return 0; - } - - if (state->memsize) { /* some data left from previous update */ - const uint32_t *p32 = state->mem32; - - memcpy((uint8_t *)(state->mem32) + state->memsize, input, - 16 - state->memsize); - - state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); - p32++; - state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); - p32++; - state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); - p32++; - state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); - p32++; - - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= b_end - 16) { - const uint8_t *const limit = b_end - 16; - uint32_t v1 = state->v1; - uint32_t v2 = state->v2; - uint32_t v3 = state->v3; - uint32_t v4 = state->v4; - - do { - v1 = xxh32_round(v1, get_unaligned_le32(p)); - p += 4; - v2 = xxh32_round(v2, get_unaligned_le32(p)); - p += 4; - v3 = xxh32_round(v3, get_unaligned_le32(p)); - p += 4; - v4 = xxh32_round(v4, get_unaligned_le32(p)); - p += 4; - } while (p <= limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < b_end) { - memcpy(state->mem32, p, (size_t)(b_end-p)); - state->memsize = (uint32_t)(b_end-p); - } - - return 0; -} -EXPORT_SYMBOL(xxh32_update); - -uint32_t xxh32_digest(const struct xxh32_state *state) -{ - const uint8_t *p = (const uint8_t *)state->mem32; - const uint8_t *const b_end = (const uint8_t *)(state->mem32) + - state->memsize; - uint32_t h32; - - if (state->large_len) { - h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + - xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); - } else { - h32 = state->v3 /* == seed */ + PRIME32_5; - } - - h32 += state->total_len_32; - - while (p + 4 <= b_end) { - h32 += get_unaligned_le32(p) * PRIME32_3; - h32 = xxh_rotl32(h32, 17) * PRIME32_4; - p += 4; - } - - while (p < b_end) { - h32 += (*p) * PRIME32_5; - h32 = xxh_rotl32(h32, 11) * PRIME32_1; - p++; - } - - h32 ^= h32 >> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} -EXPORT_SYMBOL(xxh32_digest); - int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; -- cgit v1.2.3 From 07d24902977e4704fab8472981e73a0ad6dfa1fd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 10 Jun 2025 08:53:27 +0000 Subject: kexec: enable CMA based contiguous allocation When booting a new kernel with kexec_file, the kernel picks a target location that the kernel should live at, then allocates random pages, checks whether any of those patches magically happens to coincide with a target address range and if so, uses them for that range. For every page allocated this way, it then creates a page list that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - copies to their final memory location. We can not put them there before, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. Copying is happening from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB. All of this is inefficient and error prone. To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted. This started a month long journey to root cause failing kexecs to eventually see memory corruption, because the new kernel was corrupted severely enough that it could not emit output to tell us about the fact that it was corrupted. By allocating memory for the next kernel from a memory range that is guaranteed scribbling free, we can boot the next kernel up to a point where it is at least able to detect corruption and maybe even stop it before it becomes severe. This increases the chance for successful kexecs. Since kexec got introduced, Linux has gained the CMA framework which can perform physically contiguous memory mappings, while keeping that memory available for movable memory when it is not needed for contiguous allocations. The default CMA allocator is for DMA allocations. This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. If successful, it uses that memory range directly instead of creating copy instructions during the hot phase. To ensure that there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag for user space to force disable CMA allocations. Using CMA allocations has two advantages: 1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the hot phase. 2) More robust. Even if by accident some page is still in use for DMA, the new kernel image will be safe from that access because it resides in a memory region that is considered allocated in the old kernel and has a chance to reinitialize that component. Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com Signed-off-by: Alexander Graf Acked-by: Baoquan He Reviewed-by: Pasha Tatashin Cc: Zhongkun He Signed-off-by: Andrew Morton --- arch/riscv/kernel/kexec_elf.c | 1 + include/linux/kexec.h | 10 +++++ include/uapi/linux/kexec.h | 1 + kernel/kexec.c | 2 +- kernel/kexec_core.c | 100 ++++++++++++++++++++++++++++++++++++++---- kernel/kexec_file.c | 51 ++++++++++++++++++++- kernel/kexec_internal.h | 2 +- 7 files changed, 156 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c index f4755d49b89e..56444c7bd34e 100644 --- a/arch/riscv/kernel/kexec_elf.c +++ b/arch/riscv/kernel/kexec_elf.c @@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, kbuf.buf_align = PMD_SIZE; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); + kbuf.cma = NULL; kbuf.top_down = false; ret = arch_kexec_locate_mem_hole(&kbuf); if (!ret) { diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 03f85ad03025..1b10a5d84b68 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes; typedef unsigned long kimage_entry_t; +/* + * This is a copy of the UAPI struct kexec_segment and must be identical + * to it because it gets copied straight from user space into kernel + * memory. Do not modify this structure unless you change the way segments + * get ingested from user space. + */ struct kexec_segment { /* * This pointer can point to user memory if kexec_load() system @@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image); * @buf_align: Minimum alignment needed. * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. + * @cma: CMA page if the buffer is backed by CMA. * @top_down: Allocate from top of memory. * @random: Place the buffer at a random position. */ @@ -184,6 +191,7 @@ struct kexec_buf { unsigned long buf_align; unsigned long buf_min; unsigned long buf_max; + struct page *cma; bool top_down; #ifdef CONFIG_CRASH_DUMP bool random; @@ -340,6 +348,7 @@ struct kimage { unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + struct page *segment_cma[KEXEC_SEGMENT_MAX]; struct list_head control_pages; struct list_head dest_pages; @@ -361,6 +370,7 @@ struct kimage { */ unsigned int hotplug_support:1; #endif + unsigned int no_cma:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 5ae1741ea8ea..8958ebfcff94 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -27,6 +27,7 @@ #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 +#define KEXEC_FILE_NO_CMA 0x00000010 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3a9a9f240dbc..e390c0df6d55 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. @@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + unsigned long maddr; + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -787,13 +872,13 @@ out: } #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -858,18 +943,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 69fe76fd9233..41271eee0f99 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (ret <= 0) return ret; + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. + */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. */ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 30a733a55a67..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -- cgit v1.2.3 From d171b10b2d7b067c16d79e1d069a23a34f088d23 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 22 Jul 2025 11:22:30 -0700 Subject: mm/page-flags: remove folio_start_writeback_keepwrite() Commit cd57b77197a4 ("ext4: Convert ext4_bio_write_page() to use a folio) removed set_page_writeback_keepwrite() which was the last/only caller of folio_start_writeback_keepwrite(). Link: https://lkml.kernel.org/r/20250722182230.2114587-1-joannelkoong@gmail.com Signed-off-by: Joanne Koong Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8e4d6eda8a8d..8d3fa3a91ce4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -837,8 +837,6 @@ void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) -#define folio_start_writeback_keepwrite(folio) \ - __folio_start_writeback(folio, true) static __always_inline bool folio_test_head(const struct folio *folio) { -- cgit v1.2.3 From f225b34f1e6c81c50e48f6207ddb6d290be1b932 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 09:29:41 +0100 Subject: mm/mseal: always define VM_SEALED Patch series "mseal cleanups", v4. Perform a number of cleanups to the mseal logic. Firstly, VM_SEALED is treated differently from every other VMA flag, it really doesn't make sense to do this, so we start by making this consistent with everything else. Next we place the madvise logic where it belongs - in mm/madvise.c. It really makes no sense to abstract this elsewhere. In doing so, we go to great lengths to explain very clearly the previously very confusing logic as to what sealed mappings are impacted here. In doing so, we retain existing logic regarding treatment of madvise() discard operations for a sealed, read-only MAP_PRIVATE file-backed mapping. This is something we likely need to revisit. We then abstract out and explain the 'are there are any gaps in this range in the mm?' check being performed as a prerequisite to mseal being performed. Finally, we simplify the actual mseal logic which is really quite straightforward. No functional change is intended. This patch (of 4): There is no reason to treat VM_SEALED in a special way, in each other case in which a VMA flag is unavailable due to configuration, we simply assign that flag to VM_NONE, so make VM_SEALED consistent with all other VMA flags in this respect. Additionally, use the next available bit for VM_SEALED, 42, rather than arbitrarily putting it at 63 and update the declaration to match all other VMA flags. No functional change intended. Link: https://lkml.kernel.org/r/cover.1753431105.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/aeb398a77029b6e7377cd944328bc9bbc3c90537.1753431105.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand Cc: Jann Horn Cc: Jeff Xu Cc: Kees Cook Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- tools/testing/vma/vma_internal.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e3a4c5b78ff..ceaa780a703a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 991022e9e0d3..0fe52fd6782b 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr; #define CAP_IPC_LOCK 14 #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif #define FIRST_USER_ADDRESS 0UL -- cgit v1.2.3 From 3dfde97800e06882960cc926d2c428f2128b7c70 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Jul 2025 10:52:59 +0530 Subject: mm: add get_and_clear_ptes() and clear_ptes() Patch series "Optimizations for khugepaged", v4. If the underlying folio mapped by the ptes is large, we can process those ptes in a batch using folio_pte_batch(). For arm64 specifically, this results in a 16x reduction in the number of ptep_get() calls, since on a contig block, ptep_get() on arm64 will iterate through all 16 entries to collect a/d bits. Next, ptep_clear() will cause a TLBI for every contig block in the range via contpte_try_unfold(). Instead, use clear_ptes() to only do the TLBI at the first and last contig block of the range. For split folios, there will be no pte batching; the batch size returned by folio_pte_batch() will be 1. For pagetable split folios, the ptes will still point to the same large folio; for arm64, this results in the optimization described above, and for other arches, a minor improvement is expected due to a reduction in the number of function calls and batching atomic operations. This patch (of 3): Let's add variants to be used where "full" does not apply -- which will be the majority of cases in the future. "full" really only applies if we are about to tear down a full MM. Use get_and_clear_ptes() in existing code, clear_ptes() users will be added next. Link: https://lkml.kernel.org/r/20250724052301.23844-2-dev.jain@arm.com Signed-off-by: David Hildenbrand Signed-off-by: Dev Jain Reviewed-by: Baolin Wang Reviewed-by: Barry Song Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 +- include/linux/pgtable.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ mm/mremap.c | 2 +- mm/rmap.c | 2 +- 4 files changed, 48 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index abd9725796e9..20a89ab97dc5 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init); pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0); + pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e3b99920be05..4c035637eeb7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, } #endif +/** + * get_and_clear_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of get_and_clear_full_ptes() if it is known that we don't + * need to clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); +} + #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same @@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, } #endif +/** + * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of clear_full_ptes() if it is known that we don't need to + * clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + clear_full_ptes(mm, addr, ptep, nr, 0); +} + /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread diff --git a/mm/mremap.c b/mm/mremap.c index ac39845e9718..677a4d744df9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc, old_pte, max_nr_ptes); force_flush = true; } - pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); + pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes); pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); diff --git a/mm/rmap.c b/mm/rmap.c index f93ce27132ab..568198e9efc2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, flush_cache_range(vma, address, end_addr); /* Nuke the page table entry. */ - pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); + pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. -- cgit v1.2.3 From 9a4f90e246615d1f42a9b907deb9b4c0a418d996 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 15:29:01 +0100 Subject: mm: remove mm/io-mapping.c This is dead code, which was used from commit b739f125e4eb ("i915: use io_mapping_map_user") but reverted a month later by commit 0e4fe0c9f2f9 ("Revert "i915: use io_mapping_map_user"") back in 2021. Since then nobody has used it, so remove it. [akpm@linux-foundation.org: update Documentation/core-api/mm-api.rst, per Vlastimil] Link: https://lkml.kernel.org/r/20250725142901.81502-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 1 - include/linux/io-mapping.h | 3 --- mm/Kconfig | 4 ---- mm/Makefile | 1 - mm/io-mapping.c | 30 ------------------------------ 5 files changed, 39 deletions(-) delete mode 100644 mm/io-mapping.c (limited to 'include/linux') diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index af8151db88b2..24970b91ac15 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -139,4 +139,3 @@ More Memory Management Functions .. kernel-doc:: mm/mmu_notifier.c .. kernel-doc:: mm/balloon_compaction.c .. kernel-doc:: mm/huge_memory.c -.. kernel-doc:: mm/io-mapping.c diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 7376c1df9c90..c16353cc6e3c 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap) kfree(iomap); } -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size); - #endif /* _LINUX_IO_MAPPING_H */ diff --git a/mm/Kconfig b/mm/Kconfig index d5d4eca947a6..e443fe8cd6cf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1242,10 +1242,6 @@ config KMAP_LOCAL config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY bool -# struct io_mapping based helper. Selected by drivers that need them -config IO_MAPPING - bool - config MEMFD_CREATE bool "Enable memfd_create() system call" if EXPERT diff --git a/mm/Makefile b/mm/Makefile index 1a7a11d4933d..ef54aa615d9d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o -obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/io-mapping.c b/mm/io-mapping.c deleted file mode 100644 index d3586e95c12c..000000000000 --- a/mm/io-mapping.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include -#include - -/** - * io_mapping_map_user - remap an I/O mapping to userspace - * @iomap: the source io_mapping - * @vma: user vma to map to - * @addr: target user address to start at - * @pfn: physical address of kernel memory - * @size: size of map area - * - * Note: this is only safe if the mm semaphore is held when called. - */ -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size) -{ - vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; - - if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) - return -EINVAL; - - pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); - - /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */ - return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); -} -EXPORT_SYMBOL_GPL(io_mapping_map_user); -- cgit v1.2.3 From a222439e1e273fa0f4e37ce17aeb109f3e91824f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 25 Jul 2025 14:16:24 +0200 Subject: mm/rmap: add anon_vma lifetime debug check If an anon folio is mapped into userspace, its anon_vma must be alive, otherwise rmap walks can hit UAF. There have been syzkaller reports a few months ago[1][2] of UAF in rmap walks that seems to indicate that there can be pages with elevated mapcount whose anon_vma has already been freed, but I think we never figured out what the cause is; and syzkaller only hit these UAFs when memory pressure randomly caused reclaim to rmap-walk the affected pages, so it of course didn't manage to create a reproducer. Add a VM_WARN_ON_FOLIO() when we add/remove mappings of anonymous folios to hopefully catch such issues more reliably. [1] https://lore.kernel.org/r/67abaeaf.050a0220.110943.0041.GAE@google.com [2] https://lore.kernel.org/r/67a76f33.050a0220.3d72c.0028.GAE@google.com Link: https://lkml.kernel.org/r/20250725-anonvma-uaf-debug-v2-1-bc3c7e5ba5b1@google.com Signed-off-by: Jann Horn Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Harry Yoo Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/rmap.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 20803fcb49a7..6cd020eea37a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, default: VM_WARN_ON_ONCE(true); } + + /* + * Anon folios must have an associated live anon_vma as long as they're + * mapped into userspace. + * Note that the atomic_read() mainly does two things: + * + * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to + * check that the associated anon_vma has not yet been freed (subject + * to KASAN's usual limitations). This check will pass if the + * anon_vma's refcount has already dropped to 0 but an RCU grace + * period hasn't passed since then. + * 2. If the anon_vma has not yet been freed, it checks that the + * anon_vma still has a nonzero refcount (as opposed to being in the + * middle of an RCU delay for getting freed). + */ + if (folio_test_anon(folio) && !folio_test_ksm(folio)) { + unsigned long mapping = (unsigned long)folio->mapping; + struct anon_vma *anon_vma; + + anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON); + VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio); + } } /* -- cgit v1.2.3 From 9bbffee67ffd16360179327b57f3b1245579ef08 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 28 Jul 2025 10:53:55 -0700 Subject: mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped By inducing delays in the right places, Jann Horn created a reproducer for a hard to hit UAF issue that became possible after VMAs were allowed to be recycled by adding SLAB_TYPESAFE_BY_RCU to their cache. Race description is borrowed from Jann's discovery report: lock_vma_under_rcu() looks up a VMA locklessly with mas_walk() under rcu_read_lock(). At that point, the VMA may be concurrently freed, and it can be recycled by another process. vma_start_read() then increments the vma->vm_refcnt (if it is in an acceptable range), and if this succeeds, vma_start_read() can return a recycled VMA. In this scenario where the VMA has been recycled, lock_vma_under_rcu() will then detect the mismatching ->vm_mm pointer and drop the VMA through vma_end_read(), which calls vma_refcount_put(). vma_refcount_put() drops the refcount and then calls rcuwait_wake_up() using a copy of vma->vm_mm. This is wrong: It implicitly assumes that the caller is keeping the VMA's mm alive, but in this scenario the caller has no relation to the VMA's mm, so the rcuwait_wake_up() can cause UAF. The diagram depicting the race: T1 T2 T3 == == == lock_vma_under_rcu mas_walk mmap vma_start_read __refcount_inc_not_zero_limited_acquire munmap __vma_enter_locked refcount_add_not_zero vma_end_read vma_refcount_put __refcount_dec_and_test rcuwait_wait_event rcuwait_wake_up [UAF] Note that rcuwait_wait_event() in T3 does not block because refcount was already dropped by T1. At this point T3 can exit and free the mm causing UAF in T1. To avoid this we move vma->vm_mm verification into vma_start_read() and grab vma->vm_mm to stabilize it before vma_refcount_put() operation. [surenb@google.com: v3] Link: https://lkml.kernel.org/r/20250729145709.2731370-1-surenb@google.com Link: https://lkml.kernel.org/r/20250728175355.2282375-1-surenb@google.com Fixes: 3104138517fc ("mm: make vma cache SLAB_TYPESAFE_BY_RCU") Signed-off-by: Suren Baghdasaryan Reported-by: Jann Horn Closes: https://lore.kernel.org/all/CAG48ez0-deFbVH=E3jbkWx=X3uVbd8nWeo6kbJPQ0KoUD+m2tA@mail.gmail.com/ Reviewed-by: Vlastimil Babka Acked-by: Lorenzo Stoakes Cc: Jann Horn Cc: Liam Howlett Cc: Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 30 ++++++++++++++++++++++++++++++ mm/mmap_lock.c | 10 +++------- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 1f4f44951abe..11a078de9150 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w); #include #include #include +#include #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), @@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. + * + * WARNING! The vma passed to this function cannot be used if the function + * fails to lock it because in certain cases RCU lock is dropped and then + * reacquired. Once RCU lock is dropped the vma can be concurently freed. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) @@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + if (unlikely(vma->vm_mm != mm)) { + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *other_mm = vma->vm_mm; + + /* + * __mmdrop() is a heavy operation and we don't need RCU + * protection here. Release RCU lock during these operations. + * We reinstate the RCU read lock as the caller expects it to + * be held when this function returns even on error. + */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + rcu_read_lock(); + return NULL; + } + /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 729fb7d0dd59..b006cec8e6fe 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -164,8 +164,7 @@ retry: */ /* Check if the vma we locked is the right one. */ - if (unlikely(vma->vm_mm != mm || - address < vma->vm_start || address >= vma->vm_end)) + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); @@ -236,11 +235,8 @@ retry: goto fallback; } - /* - * Verify the vma we locked belongs to the same address space and it's - * not behind of the last search position. - */ - if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end)) + /* Verify the vma is not behind the last search position. */ + if (unlikely(from_addr >= vma->vm_end)) goto fallback_unlock; /* -- cgit v1.2.3 From fcd90ad31e29d0b403f3a074a64cd7f0876175dd Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:23 +0300 Subject: execmem: drop unused execmem_update_copy() Patch series "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes", v3. These patches enable use of EXECMEM_ROX_CACHE for ftrace and kprobes allocations on x86. They also include some ground work in execmem. Since the execmem model for caching large ROX pages changed from the initial assumption that the memory that is allocated from ROX cache is always ROX to the current state where memory can be temporarily made RW and then restored to ROX, we can stop using text poking to update it. This also saves the hassle of trying lock text_mutex in execmem_cache_free() when kprobes already hold that mutex. This patch (of 8): The execmem_update_copy() that used text poking was required when memory allocated from ROX cache was always read-only. Since now its permissions can be switched to read-write there is no need in a function that updates memory with text poking. Remove it. Link: https://lkml.kernel.org/r/20250713071730.4117334-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20250713071730.4117334-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/execmem.h | 13 ------------- mm/execmem.c | 5 ----- 2 files changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 3be35680a54f..734fbe83d98e 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -185,19 +185,6 @@ DEFINE_FREE(execmem, void *, if (_T) execmem_free(_T)); struct vm_struct *execmem_vmap(size_t size); #endif -/** - * execmem_update_copy - copy an update to executable memory - * @dst: destination address to update - * @src: source address containing the data - * @size: how many bytes of memory shold be copied - * - * Copy @size bytes from @src to @dst using text poking if the memory at - * @dst is read-only. - * - * Return: a pointer to @dst or NULL on error - */ -void *execmem_update_copy(void *dst, const void *src, size_t size); - /** * execmem_is_rox - check if execmem is read-only * @type - the execmem type to check diff --git a/mm/execmem.c b/mm/execmem.c index 627e6cf64f4f..aac211bc88c5 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -399,11 +399,6 @@ void execmem_free(void *ptr) vfree(ptr); } -void *execmem_update_copy(void *dst, const void *src, size_t size) -{ - return text_poke_copy(dst, src, size); -} - bool execmem_is_rox(enum execmem_type type) { return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); -- cgit v1.2.3 From 838955f64ae7582f009a3538889bb9244f37ab26 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:24 +0300 Subject: execmem: introduce execmem_alloc_rw() Some callers of execmem_alloc() require the memory to be temporarily writable even when it is allocated from ROX cache. These callers use execemem_make_temp_rw() right after the call to execmem_alloc(). Wrap this sequence in execmem_alloc_rw() API. Link: https://lkml.kernel.org/r/20250713071730.4117334-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Acked-by: Peter Zijlstra (Intel) Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- arch/x86/kernel/alternative.c | 3 +-- include/linux/execmem.h | 38 ++++++++++++++++++++++---------------- kernel/module/main.c | 13 ++----------- mm/execmem.c | 27 ++++++++++++++++++++++++++- 4 files changed, 51 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ea1d984166cd..526a5fef93ab 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -120,7 +120,7 @@ struct its_array its_pages; static void *__its_alloc(struct its_array *pages) { - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE); if (!page) return NULL; @@ -237,7 +237,6 @@ static void *its_alloc(void) if (!page) return NULL; - execmem_make_temp_rw(page, PAGE_SIZE); if (pages == &its_pages) set_memory_x((unsigned long)page, 1); diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 734fbe83d98e..8b61b05da7d5 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -67,21 +67,6 @@ enum execmem_range_flags { */ void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); -/** - * execmem_make_temp_rw - temporarily remap region with read-write - * permissions - * @ptr: address of the region to remap - * @size: size of the region to remap - * - * Remaps a part of the cached large page in the ROX cache in the range - * [@ptr, @ptr + @size) as writable and not executable. The caller must - * have exclusive ownership of this range and ensure nothing will try to - * execute code in this range. - * - * Return: 0 on success or negative error code on failure. - */ -int execmem_make_temp_rw(void *ptr, size_t size); - /** * execmem_restore_rox - restore read-only-execute permissions * @ptr: address of the region to remap @@ -95,7 +80,6 @@ int execmem_make_temp_rw(void *ptr, size_t size); */ int execmem_restore_rox(void *ptr, size_t size); #else -static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif @@ -165,6 +149,28 @@ struct execmem_info *execmem_arch_setup(void); */ void *execmem_alloc(enum execmem_type type, size_t size); +/** + * execmem_alloc_rw - allocate writable executable memory + * @type: type of the allocation + * @size: how many bytes of memory are required + * + * Allocates memory that will contain executable code, either generated or + * loaded from kernel modules. + * + * Allocates memory that will contain data coupled with executable code, + * like data sections in kernel modules. + * + * Forces writable permissions on the allocated memory and the caller is + * responsible to manage the permissions afterwards. + * + * For architectures that use ROX cache the permissions will be set to R+W. + * For architectures that don't use ROX cache the default permissions for @type + * will be used as they must be writable. + * + * Return: a pointer to the allocated memory or %NULL + */ +void *execmem_alloc_rw(enum execmem_type type, size_t size); + /** * execmem_free - free executable memory * @ptr: pointer to the memory that should be freed diff --git a/kernel/module/main.c b/kernel/module/main.c index 413ac6ea3702..d009326ef7bb 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1292,20 +1292,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) else execmem_type = EXECMEM_MODULE_TEXT; - ptr = execmem_alloc(execmem_type, size); + ptr = execmem_alloc_rw(execmem_type, size); if (!ptr) return -ENOMEM; - if (execmem_is_rox(execmem_type)) { - int err = execmem_make_temp_rw(ptr, size); - - if (err) { - execmem_free(ptr); - return -ENOMEM; - } - - mod->mem[type].is_rox = true; - } + mod->mem[type].is_rox = execmem_is_rox(execmem_type); /* * The pointer to these blocks of memory are stored on the module diff --git a/mm/execmem.c b/mm/execmem.c index aac211bc88c5..d0bf0123bce4 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -336,7 +336,7 @@ static bool execmem_cache_free(void *ptr) return true; } -int execmem_make_temp_rw(void *ptr, size_t size) +static int execmem_force_rw(void *ptr, size_t size) { unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long addr = (unsigned long)ptr; @@ -358,6 +358,16 @@ int execmem_restore_rox(void *ptr, size_t size) } #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ +/* + * when ROX cache is not used the permissions defined by architectures for + * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must + * be writable anyway + */ +static inline int execmem_force_rw(void *ptr, size_t size) +{ + return 0; +} + static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { return NULL; @@ -387,6 +397,21 @@ void *execmem_alloc(enum execmem_type type, size_t size) return kasan_reset_tag(p); } +void *execmem_alloc_rw(enum execmem_type type, size_t size) +{ + void *p __free(execmem) = execmem_alloc(type, size); + int err; + + if (!p) + return NULL; + + err = execmem_force_rw(p, size); + if (err) + return NULL; + + return no_free_ptr(p); +} + void execmem_free(void *ptr) { /* -- cgit v1.2.3 From ab674b6871b049aab2e86d1d7375526368ed175a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:28 +0300 Subject: execmem: drop writable parameter from execmem_fill_trapping_insns() After update of execmem_cache_free() that made memory writable before updating it, there is no need to update read only memory, so the writable parameter to execmem_fill_trapping_insns() is not needed. Drop it. Link: https://lkml.kernel.org/r/20250713071730.4117334-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- arch/x86/mm/init.c | 8 ++------ include/linux/execmem.h | 3 +-- mm/execmem.c | 4 ++-- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7456df985d96..dbc63f0d538f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void) static struct execmem_info execmem_info __ro_after_init; #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) +void execmem_fill_trapping_insns(void *ptr, size_t size) { - /* fill memory with INT3 instructions */ - if (writeable) - memset(ptr, INT3_INSN_OPCODE, size); - else - text_poke_set(ptr, INT3_INSN_OPCODE, size); + memset(ptr, INT3_INSN_OPCODE, size); } #endif diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 8b61b05da7d5..7de229134e30 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -60,12 +60,11 @@ enum execmem_range_flags { * will trap * @ptr: pointer to memory to fill * @size: size of the range to fill - * @writable: is the memory poited by @ptr is writable or ROX * * A hook for architecures to fill execmem ranges with invalid instructions. * Architectures that use EXECMEM_ROX_CACHE must implement this. */ -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); +void execmem_fill_trapping_insns(void *ptr, size_t size); /** * execmem_restore_rox - restore read-only-execute permissions diff --git a/mm/execmem.c b/mm/execmem.c index 9abf76a63a79..1785d7f435e4 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -304,7 +304,7 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) goto err_free_mem; /* fill memory with instructions that will trap */ - execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); + execmem_fill_trapping_insns(p, alloc_size); err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) @@ -363,7 +363,7 @@ static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) if (err) return err; - execmem_fill_trapping_insns(ptr, size, /* writable = */ true); + execmem_fill_trapping_insns(ptr, size); execmem_restore_rox(ptr, size); err = execmem_cache_add_locked(ptr, size, gfp_mask); -- cgit v1.2.3 From bb5b0b4317c9516bdc5e9a4235e3b5f1a73b7e48 Mon Sep 17 00:00:00 2001 From: Joshua Kinard Date: Mon, 21 Jul 2025 13:00:51 -0400 Subject: rtc: ds1685: Update Joshua Kinard's email address. I am switching my address to a personal domain, so need to update the driver's files and the entry in MAINTAINERS. Signed-off-by: Joshua Kinard Link: https://lore.kernel.org/r/20250721170051.32407-1-kumba@gentoo.org Signed-off-by: Alexandre Belloni --- MAINTAINERS | 2 +- drivers/rtc/rtc-ds1685.c | 4 ++-- include/linux/rtc/ds1685.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..536befd32be8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6608,7 +6608,7 @@ S: Supported F: drivers/input/keyboard/dlink-dir685-touchkeys.c DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK -M: Joshua Kinard +M: Joshua Kinard S: Maintained F: drivers/rtc/rtc-ds1685.c F: include/linux/rtc/ds1685.h diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index 38e25f63597a..97423f1d0361 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -3,7 +3,7 @@ * An rtc driver for the Dallas/Maxim DS1685/DS1687 and related real-time * chips. * - * Copyright (C) 2011-2014 Joshua Kinard . + * Copyright (C) 2011-2014 Joshua Kinard . * Copyright (C) 2009 Matthias Fuchs . * * References: @@ -1436,7 +1436,7 @@ EXPORT_SYMBOL_GPL(ds1685_rtc_poweroff); /* ----------------------------------------------------------------------- */ -MODULE_AUTHOR("Joshua Kinard "); +MODULE_AUTHOR("Joshua Kinard "); MODULE_AUTHOR("Matthias Fuchs "); MODULE_DESCRIPTION("Dallas/Maxim DS1685/DS1687-series RTC driver"); MODULE_LICENSE("GPL"); diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h index 5a41c3bbcbe3..01da4582db6d 100644 --- a/include/linux/rtc/ds1685.h +++ b/include/linux/rtc/ds1685.h @@ -8,7 +8,7 @@ * include larger, battery-backed NV-SRAM, burst-mode access, and an RTC * write counter. * - * Copyright (C) 2011-2014 Joshua Kinard . + * Copyright (C) 2011-2014 Joshua Kinard . * Copyright (C) 2009 Matthias Fuchs . * * References: -- cgit v1.2.3 From 86624ba3b522b6512def25534341da93356c8da4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 13:08:25 -0300 Subject: vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD This was missed during the initial implementation. The VFIO PCI encodes the vf_token inside the device name when opening the device from the group FD, something like: "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" This is used to control access to a VF unless there is co-ordination with the owner of the PF. Since we no longer have a device name in the cdev path, pass the token directly through VFIO_DEVICE_BIND_IOMMUFD using an optional field indicated by VFIO_DEVICE_BIND_FLAG_TOKEN. Fixes: 5fcc26969a16 ("vfio: Add VFIO_DEVICE_BIND_IOMMUFD") Tested-by: Shameer Kolothum Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v3-bdd8716e85fe+3978a-vfio_token_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 38 ++++++++++++++++++++++++-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + drivers/vfio/pci/mlx5/main.c | 1 + drivers/vfio/pci/nvgrace-gpu/main.c | 2 ++ drivers/vfio/pci/pds/vfio_dev.c | 1 + drivers/vfio/pci/qat/main.c | 1 + drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_core.c | 22 +++++++++------ drivers/vfio/pci/virtio/main.c | 3 ++ include/linux/vfio.h | 4 +++ include/linux/vfio_pci_core.h | 2 ++ include/uapi/linux/vfio.h | 12 +++++++- 12 files changed, 76 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 281a8dc3ed49..480cac3a0c27 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -60,22 +60,50 @@ static void vfio_df_get_kvm_safe(struct vfio_device_file *df) spin_unlock(&df->kvm_ref_lock); } +static int vfio_df_check_token(struct vfio_device *device, + const struct vfio_device_bind_iommufd *bind) +{ + uuid_t uuid; + + if (!device->ops->match_token_uuid) { + if (bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN) + return -EINVAL; + return 0; + } + + if (!(bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN)) + return device->ops->match_token_uuid(device, NULL); + + if (copy_from_user(&uuid, u64_to_user_ptr(bind->token_uuid_ptr), + sizeof(uuid))) + return -EFAULT; + return device->ops->match_token_uuid(device, &uuid); +} + long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, struct vfio_device_bind_iommufd __user *arg) { + const u32 VALID_FLAGS = VFIO_DEVICE_BIND_FLAG_TOKEN; struct vfio_device *device = df->device; struct vfio_device_bind_iommufd bind; unsigned long minsz; + u32 user_size; int ret; static_assert(__same_type(arg->out_devid, df->devid)); minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); - if (copy_from_user(&bind, arg, minsz)) - return -EFAULT; + ret = get_user(user_size, &arg->argsz); + if (ret) + return ret; + if (user_size < minsz) + return -EINVAL; + ret = copy_struct_from_user(&bind, minsz, arg, user_size); + if (ret) + return ret; - if (bind.argsz < minsz || bind.flags || bind.iommufd < 0) + if (bind.iommufd < 0 || bind.flags & ~VALID_FLAGS) return -EINVAL; /* BIND_IOMMUFD only allowed for cdev fds */ @@ -93,6 +121,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, goto out_unlock; } + ret = vfio_df_check_token(device, &bind); + if (ret) + goto out_unlock; + df->iommufd = iommufd_ctx_from_fd(bind.iommufd); if (IS_ERR(df->iommufd)) { ret = PTR_ERR(df->iommufd); diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 2149f49aeec7..397f5e445136 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1583,6 +1583,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 93f894fe60d2..7ec47e736a8e 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1372,6 +1372,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e5ac39c4cc6b..d95761dcdd58 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -696,6 +696,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .mmap = nvgrace_gpu_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -715,6 +716,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f6e0253a8a14..f3ccb0008f67 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -201,6 +201,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 845ed15b6771..5cce6b0b8d2f 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -614,6 +614,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 5ba39f7623bb..ac10f14417f2 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -138,6 +138,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 261a6dc5a5fc..fad410cf91bc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1821,9 +1821,13 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) } EXPORT_SYMBOL_GPL(vfio_pci_core_request); -static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, - bool vf_token, uuid_t *uuid) +int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, + const uuid_t *uuid) + { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + /* * There's always some degree of trust or collaboration between SR-IOV * PF and VFs, even if just that the PF hosts the SR-IOV capability and @@ -1854,7 +1858,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, bool match; if (!pf_vdev) { - if (!vf_token) + if (!uuid) return 0; /* PF is not vfio-pci, no VF token */ pci_info_ratelimited(vdev->pdev, @@ -1862,7 +1866,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return -EINVAL; } - if (!vf_token) { + if (!uuid) { pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1880,7 +1884,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, } else if (vdev->vf_token) { mutex_lock(&vdev->vf_token->lock); if (vdev->vf_token->users) { - if (!vf_token) { + if (!uuid) { mutex_unlock(&vdev->vf_token->lock); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); @@ -1893,12 +1897,12 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, "Incorrect VF token provided for device\n"); return -EACCES; } - } else if (vf_token) { + } else if (uuid) { uuid_copy(&vdev->vf_token->uuid, uuid); } mutex_unlock(&vdev->vf_token->lock); - } else if (vf_token) { + } else if (uuid) { pci_info_ratelimited(vdev->pdev, "VF token incorrectly provided, not a PF or VF\n"); return -EINVAL; @@ -1906,6 +1910,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_match_token_uuid); #define VF_TOKEN_ARG "vf_token=" @@ -1952,7 +1957,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) } } - ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); + ret = core_vdev->ops->match_token_uuid(core_vdev, + vf_token ? &uuid : NULL); if (ret) return ret; diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 515fe1b9f94d..8084f3e36a9f 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -94,6 +94,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -114,6 +115,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -134,6 +136,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 707b00772ce1..eb563f538dee 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -105,6 +105,9 @@ struct vfio_device { * @match: Optional device name match callback (return: 0 for no-match, >0 for * match, -errno for abort (ex. match with insufficient or incorrect * additional args) + * @match_token_uuid: Optional device token match/validation. Return 0 + * if the uuid is valid for the device, -errno otherwise. uuid is NULL + * if none was provided. * @dma_unmap: Called when userspace unmaps IOVA from the container * this device is attached to. * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl @@ -132,6 +135,7 @@ struct vfio_device_ops { int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); + int (*match_token_uuid)(struct vfio_device *vdev, const uuid_t *uuid); void (*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length); int (*device_feature)(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index fbb472dd99b3..f541044e42a2 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -122,6 +122,8 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); +int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, + const uuid_t *uuid); int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5764f315137f..75100bf009ba 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -905,10 +905,12 @@ struct vfio_device_feature { * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18, * struct vfio_device_bind_iommufd) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Must be 0 or a bit flags of VFIO_DEVICE_BIND_* * @iommufd: iommufd to bind. * @out_devid: The device id generated by this bind. devid is a handle for * this device/iommufd bond and can be used in IOMMUFD commands. + * @token_uuid_ptr: Valid if VFIO_DEVICE_BIND_FLAG_TOKEN. Points to a 16 byte + * UUID in the same format as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN. * * Bind a vfio_device to the specified iommufd. * @@ -917,13 +919,21 @@ struct vfio_device_feature { * * Unbind is automatically conducted when device fd is closed. * + * A token is sometimes required to open the device, unless this is known to be + * needed VFIO_DEVICE_BIND_FLAG_TOKEN should not be set and token_uuid_ptr is + * ignored. The only case today is a PF/VF relationship where the VF bind must + * be provided the same token as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN provided to + * the PF. + * * Return: 0 on success, -errno on failure. */ struct vfio_device_bind_iommufd { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_BIND_FLAG_TOKEN (1 << 0) __s32 iommufd; __u32 out_devid; + __aligned_u64 token_uuid_ptr; }; #define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) -- cgit v1.2.3